diff options
Diffstat (limited to 'fs/btrfs')
94 files changed, 15347 insertions, 12735 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 23537bc8c827..575636f6491e 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -2,7 +2,12 @@ config BTRFS_FS tristate "Btrfs filesystem support" + select CRYPTO + select CRYPTO_CRC32C select LIBCRC32C + select CRYPTO_XXHASH + select CRYPTO_SHA256 + select CRYPTO_BLAKE2B select ZLIB_INFLATE select ZLIB_DEFLATE select LZO_COMPRESS diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index ca693dd554e9..82200dbca5ac 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -10,7 +10,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o free-space-tree.o tree-checker.o + uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ + block-rsv.o delalloc-space.o block-group.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 5810463dc6d2..a0af1b952c4d 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -93,7 +93,11 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_setxattr(trans, inode, name, value, size, 0); + if (trans) + ret = btrfs_setxattr(trans, inode, name, value, size, 0); + else + ret = btrfs_setxattr_trans(inode, name, value, size, 0); + out: kfree(value); diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 122cb97c7909..1d32a07bb2d1 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -12,9 +12,11 @@ #include "async-thread.h" #include "ctree.h" -#define WORK_DONE_BIT 0 -#define WORK_ORDER_DONE_BIT 1 -#define WORK_HIGH_PRIO_BIT 2 +enum { + WORK_DONE_BIT, + WORK_ORDER_DONE_BIT, + WORK_HIGH_PRIO_BIT, +}; #define NO_THRESHOLD (-1) #define DFT_THRESHOLD (32) @@ -51,24 +53,12 @@ struct btrfs_workqueue { struct __btrfs_workqueue *high; }; -static void normal_work_helper(struct btrfs_work *work); - -#define BTRFS_WORK_HELPER(name) \ -noinline_for_stack void btrfs_##name(struct work_struct *arg) \ -{ \ - struct btrfs_work *work = container_of(arg, struct btrfs_work, \ - normal_work); \ - normal_work_helper(work); \ -} - -struct btrfs_fs_info * -btrfs_workqueue_owner(const struct __btrfs_workqueue *wq) +struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq) { return wq->fs_info; } -struct btrfs_fs_info * -btrfs_work_owner(const struct btrfs_work *work) +struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work) { return work->wq->fs_info; } @@ -87,29 +77,6 @@ bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq) return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2; } -BTRFS_WORK_HELPER(worker_helper); -BTRFS_WORK_HELPER(delalloc_helper); -BTRFS_WORK_HELPER(flush_delalloc_helper); -BTRFS_WORK_HELPER(cache_helper); -BTRFS_WORK_HELPER(submit_helper); -BTRFS_WORK_HELPER(fixup_helper); -BTRFS_WORK_HELPER(endio_helper); -BTRFS_WORK_HELPER(endio_meta_helper); -BTRFS_WORK_HELPER(endio_meta_write_helper); -BTRFS_WORK_HELPER(endio_raid56_helper); -BTRFS_WORK_HELPER(endio_repair_helper); -BTRFS_WORK_HELPER(rmw_helper); -BTRFS_WORK_HELPER(endio_write_helper); -BTRFS_WORK_HELPER(freespace_write_helper); -BTRFS_WORK_HELPER(delayed_meta_helper); -BTRFS_WORK_HELPER(readahead_helper); -BTRFS_WORK_HELPER(qgroup_rescan_helper); -BTRFS_WORK_HELPER(extent_refs_helper); -BTRFS_WORK_HELPER(scrub_helper); -BTRFS_WORK_HELPER(scrubwrc_helper); -BTRFS_WORK_HELPER(scrubnc_helper); -BTRFS_WORK_HELPER(scrubparity_helper); - static struct __btrfs_workqueue * __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, unsigned int flags, int limit_active, int thresh) @@ -250,16 +217,16 @@ out: } } -static void run_ordered_work(struct __btrfs_workqueue *wq) +static void run_ordered_work(struct __btrfs_workqueue *wq, + struct btrfs_work *self) { struct list_head *list = &wq->ordered_list; struct btrfs_work *work; spinlock_t *lock = &wq->list_lock; unsigned long flags; + bool free_self = false; while (1) { - void *wtag; - spin_lock_irqsave(lock, flags); if (list_empty(list)) break; @@ -285,22 +252,53 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) list_del(&work->ordered_list); spin_unlock_irqrestore(lock, flags); - /* - * We don't want to call the ordered free functions with the - * lock held though. Save the work as tag for the trace event, - * because the callback could free the structure. - */ - wtag = work; - work->ordered_free(work); - trace_btrfs_all_work_done(wq->fs_info, wtag); + if (work == self) { + /* + * This is the work item that the worker is currently + * executing. + * + * The kernel workqueue code guarantees non-reentrancy + * of work items. I.e., if a work item with the same + * address and work function is queued twice, the second + * execution is blocked until the first one finishes. A + * work item may be freed and recycled with the same + * work function; the workqueue code assumes that the + * original work item cannot depend on the recycled work + * item in that case (see find_worker_executing_work()). + * + * Note that different types of Btrfs work can depend on + * each other, and one type of work on one Btrfs + * filesystem may even depend on the same type of work + * on another Btrfs filesystem via, e.g., a loop device. + * Therefore, we must not allow the current work item to + * be recycled until we are really done, otherwise we + * break the above assumption and can deadlock. + */ + free_self = true; + } else { + /* + * We don't want to call the ordered free functions with + * the lock held. + */ + work->ordered_free(work); + /* NB: work must not be dereferenced past this point. */ + trace_btrfs_all_work_done(wq->fs_info, work); + } } spin_unlock_irqrestore(lock, flags); + + if (free_self) { + self->ordered_free(self); + /* NB: self must not be dereferenced past this point. */ + trace_btrfs_all_work_done(wq->fs_info, self); + } } -static void normal_work_helper(struct btrfs_work *work) +static void btrfs_work_helper(struct work_struct *normal_work) { + struct btrfs_work *work = container_of(normal_work, struct btrfs_work, + normal_work); struct __btrfs_workqueue *wq; - void *wtag; int need_order = 0; /* @@ -314,29 +312,26 @@ static void normal_work_helper(struct btrfs_work *work) if (work->ordered_func) need_order = 1; wq = work->wq; - /* Safe for tracepoints in case work gets freed by the callback */ - wtag = work; trace_btrfs_work_sched(work); thresh_exec_hook(wq); work->func(work); if (need_order) { set_bit(WORK_DONE_BIT, &work->flags); - run_ordered_work(wq); + run_ordered_work(wq, work); + } else { + /* NB: work must not be dereferenced past this point. */ + trace_btrfs_all_work_done(wq->fs_info, work); } - if (!need_order) - trace_btrfs_all_work_done(wq->fs_info, wtag); } -void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func, - btrfs_func_t func, - btrfs_func_t ordered_func, - btrfs_func_t ordered_free) +void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, + btrfs_func_t ordered_func, btrfs_func_t ordered_free) { work->func = func; work->ordered_func = ordered_func; work->ordered_free = ordered_free; - INIT_WORK(&work->normal_work, uniq_func); + INIT_WORK(&work->normal_work, btrfs_work_helper); INIT_LIST_HEAD(&work->ordered_list); work->flags = 0; } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 7861c9feba5f..a4434301d84d 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -29,49 +29,20 @@ struct btrfs_work { unsigned long flags; }; -#define BTRFS_WORK_HELPER_PROTO(name) \ -void btrfs_##name(struct work_struct *arg) - -BTRFS_WORK_HELPER_PROTO(worker_helper); -BTRFS_WORK_HELPER_PROTO(delalloc_helper); -BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper); -BTRFS_WORK_HELPER_PROTO(cache_helper); -BTRFS_WORK_HELPER_PROTO(submit_helper); -BTRFS_WORK_HELPER_PROTO(fixup_helper); -BTRFS_WORK_HELPER_PROTO(endio_helper); -BTRFS_WORK_HELPER_PROTO(endio_meta_helper); -BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper); -BTRFS_WORK_HELPER_PROTO(endio_raid56_helper); -BTRFS_WORK_HELPER_PROTO(endio_repair_helper); -BTRFS_WORK_HELPER_PROTO(rmw_helper); -BTRFS_WORK_HELPER_PROTO(endio_write_helper); -BTRFS_WORK_HELPER_PROTO(freespace_write_helper); -BTRFS_WORK_HELPER_PROTO(delayed_meta_helper); -BTRFS_WORK_HELPER_PROTO(readahead_helper); -BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper); -BTRFS_WORK_HELPER_PROTO(extent_refs_helper); -BTRFS_WORK_HELPER_PROTO(scrub_helper); -BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); -BTRFS_WORK_HELPER_PROTO(scrubnc_helper); -BTRFS_WORK_HELPER_PROTO(scrubparity_helper); - - struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, unsigned int flags, int limit_active, int thresh); -void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper, - btrfs_func_t func, - btrfs_func_t ordered_func, - btrfs_func_t ordered_free); +void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, + btrfs_func_t ordered_func, btrfs_func_t ordered_free); void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work); void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); void btrfs_set_work_high_priority(struct btrfs_work *work); -struct btrfs_fs_info *btrfs_work_owner(const struct btrfs_work *work); -struct btrfs_fs_info *btrfs_workqueue_owner(const struct __btrfs_workqueue *wq); +struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work); +struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq); bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq); #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 11459fe84a29..e5d85311d5d5 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -791,7 +791,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, count = node->ref_mod * -1; break; default: - BUG_ON(1); + BUG(); } *total_refs += count; switch (node->type) { @@ -1460,17 +1460,16 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, * callers (such as fiemap) which want to know whether the extent is * shared but do not need a ref count. * - * This attempts to allocate a transaction in order to account for - * delayed refs, but continues on even when the alloc fails. + * This attempts to attach to the running transaction in order to account for + * delayed refs, but continues on even when no running transaction exists. * * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. */ -int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) +int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, + struct ulist *roots, struct ulist *tmp) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; - struct ulist *tmp = NULL; - struct ulist *roots = NULL; struct ulist_iterator uiter; struct ulist_node *node; struct seq_list elem = SEQ_LIST_INIT(elem); @@ -1481,16 +1480,15 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) .share_count = 0, }; - tmp = ulist_alloc(GFP_NOFS); - roots = ulist_alloc(GFP_NOFS); - if (!tmp || !roots) { - ulist_free(tmp); - ulist_free(roots); - return -ENOMEM; - } + ulist_init(roots); + ulist_init(tmp); - trans = btrfs_join_transaction(root); + trans = btrfs_join_transaction_nostart(root); if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) { + ret = PTR_ERR(trans); + goto out; + } trans = NULL; down_read(&fs_info->commit_root_sem); } else { @@ -1523,8 +1521,9 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) } else { up_read(&fs_info->commit_root_sem); } - ulist_free(tmp); - ulist_free(roots); +out: + ulist_release(roots); + ulist_release(tmp); return ret; } @@ -1747,7 +1746,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, else if (flags & BTRFS_EXTENT_FLAG_DATA) *flags_ret = BTRFS_EXTENT_FLAG_DATA; else - BUG_ON(1); + BUG(); return 0; } @@ -1912,13 +1911,19 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, extent_item_objectid); if (!search_commit_root) { - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); + trans = btrfs_attach_transaction(fs_info->extent_root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT && + PTR_ERR(trans) != -EROFS) + return PTR_ERR(trans); + trans = NULL; + } + } + + if (trans) btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); - } else { + else down_read(&fs_info->commit_root_sem); - } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, tree_mod_seq_elem.seq, &refs, @@ -1951,7 +1956,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, free_leaf_list(refs); out: - if (!search_commit_root) { + if (trans) { btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); btrfs_end_transaction(trans); } else { diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 54d58988483a..777f61dc081e 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -57,7 +57,8 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, u64 *found_off); -int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr); +int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, + struct ulist *roots, struct ulist *tmp_ulist); int __init btrfs_prelim_ref_init(void); void __cold btrfs_prelim_ref_exit(void); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c new file mode 100644 index 000000000000..6934a5b8708f --- /dev/null +++ b/fs/btrfs/block-group.c @@ -0,0 +1,3189 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "misc.h" +#include "ctree.h" +#include "block-group.h" +#include "space-info.h" +#include "disk-io.h" +#include "free-space-cache.h" +#include "free-space-tree.h" +#include "disk-io.h" +#include "volumes.h" +#include "transaction.h" +#include "ref-verify.h" +#include "sysfs.h" +#include "tree-log.h" +#include "delalloc-space.h" + +/* + * Return target flags in extended format or 0 if restripe for this chunk_type + * is not in progress + * + * Should be called with balance_lock held + */ +static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + u64 target = 0; + + if (!bctl) + return 0; + + if (flags & BTRFS_BLOCK_GROUP_DATA && + bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && + bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if (flags & BTRFS_BLOCK_GROUP_METADATA && + bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + return target; +} + +/* + * @flags: available profiles in extended format (see ctree.h) + * + * Return reduced profile in chunk format. If profile changing is in progress + * (either running or paused) picks the target profile (if it's already + * available), otherwise falls back to plain reducing. + */ +static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 num_devices = fs_info->fs_devices->rw_devices; + u64 target; + u64 raid_type; + u64 allowed = 0; + + /* + * See if restripe for this chunk_type is in progress, if so try to + * reduce to the target profile + */ + spin_lock(&fs_info->balance_lock); + target = get_restripe_target(fs_info, flags); + if (target) { + /* Pick target profile only if it's already available */ + if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { + spin_unlock(&fs_info->balance_lock); + return extended_to_chunk(target); + } + } + spin_unlock(&fs_info->balance_lock); + + /* First, mask out the RAID levels which aren't possible */ + for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { + if (num_devices >= btrfs_raid_array[raid_type].devs_min) + allowed |= btrfs_raid_array[raid_type].bg_flag; + } + allowed &= flags; + + if (allowed & BTRFS_BLOCK_GROUP_RAID6) + allowed = BTRFS_BLOCK_GROUP_RAID6; + else if (allowed & BTRFS_BLOCK_GROUP_RAID5) + allowed = BTRFS_BLOCK_GROUP_RAID5; + else if (allowed & BTRFS_BLOCK_GROUP_RAID10) + allowed = BTRFS_BLOCK_GROUP_RAID10; + else if (allowed & BTRFS_BLOCK_GROUP_RAID1) + allowed = BTRFS_BLOCK_GROUP_RAID1; + else if (allowed & BTRFS_BLOCK_GROUP_RAID0) + allowed = BTRFS_BLOCK_GROUP_RAID0; + + flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; + + return extended_to_chunk(flags | allowed); +} + +static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) +{ + unsigned seq; + u64 flags; + + do { + flags = orig_flags; + seq = read_seqbegin(&fs_info->profiles_lock); + + if (flags & BTRFS_BLOCK_GROUP_DATA) + flags |= fs_info->avail_data_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + flags |= fs_info->avail_system_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + flags |= fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&fs_info->profiles_lock, seq)); + + return btrfs_reduce_alloc_profile(fs_info, flags); +} + +u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) +{ + return get_alloc_profile(fs_info, orig_flags); +} + +void btrfs_get_block_group(struct btrfs_block_group *cache) +{ + atomic_inc(&cache->count); +} + +void btrfs_put_block_group(struct btrfs_block_group *cache) +{ + if (atomic_dec_and_test(&cache->count)) { + WARN_ON(cache->pinned > 0); + WARN_ON(cache->reserved > 0); + + /* + * If not empty, someone is still holding mutex of + * full_stripe_lock, which can only be released by caller. + * And it will definitely cause use-after-free when caller + * tries to release full stripe lock. + * + * No better way to resolve, but only to warn. + */ + WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); + kfree(cache->free_space_ctl); + kfree(cache); + } +} + +/* + * This adds the block group to the fs_info rb tree for the block group cache + */ +static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, + struct btrfs_block_group *block_group) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_block_group *cache; + + spin_lock(&info->block_group_cache_lock); + p = &info->block_group_cache_tree.rb_node; + + while (*p) { + parent = *p; + cache = rb_entry(parent, struct btrfs_block_group, cache_node); + if (block_group->start < cache->start) { + p = &(*p)->rb_left; + } else if (block_group->start > cache->start) { + p = &(*p)->rb_right; + } else { + spin_unlock(&info->block_group_cache_lock); + return -EEXIST; + } + } + + rb_link_node(&block_group->cache_node, parent, p); + rb_insert_color(&block_group->cache_node, + &info->block_group_cache_tree); + + if (info->first_logical_byte > block_group->start) + info->first_logical_byte = block_group->start; + + spin_unlock(&info->block_group_cache_lock); + + return 0; +} + +/* + * This will return the block group at or after bytenr if contains is 0, else + * it will return the block group that contains the bytenr + */ +static struct btrfs_block_group *block_group_cache_tree_search( + struct btrfs_fs_info *info, u64 bytenr, int contains) +{ + struct btrfs_block_group *cache, *ret = NULL; + struct rb_node *n; + u64 end, start; + + spin_lock(&info->block_group_cache_lock); + n = info->block_group_cache_tree.rb_node; + + while (n) { + cache = rb_entry(n, struct btrfs_block_group, cache_node); + end = cache->start + cache->length - 1; + start = cache->start; + + if (bytenr < start) { + if (!contains && (!ret || start < ret->start)) + ret = cache; + n = n->rb_left; + } else if (bytenr > start) { + if (contains && bytenr <= end) { + ret = cache; + break; + } + n = n->rb_right; + } else { + ret = cache; + break; + } + } + if (ret) { + btrfs_get_block_group(ret); + if (bytenr == 0 && info->first_logical_byte > ret->start) + info->first_logical_byte = ret->start; + } + spin_unlock(&info->block_group_cache_lock); + + return ret; +} + +/* + * Return the block group that starts at or after bytenr + */ +struct btrfs_block_group *btrfs_lookup_first_block_group( + struct btrfs_fs_info *info, u64 bytenr) +{ + return block_group_cache_tree_search(info, bytenr, 0); +} + +/* + * Return the block group that contains the given bytenr + */ +struct btrfs_block_group *btrfs_lookup_block_group( + struct btrfs_fs_info *info, u64 bytenr) +{ + return block_group_cache_tree_search(info, bytenr, 1); +} + +struct btrfs_block_group *btrfs_next_block_group( + struct btrfs_block_group *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct rb_node *node; + + spin_lock(&fs_info->block_group_cache_lock); + + /* If our block group was removed, we need a full search. */ + if (RB_EMPTY_NODE(&cache->cache_node)) { + const u64 next_bytenr = cache->start + cache->length; + + spin_unlock(&fs_info->block_group_cache_lock); + btrfs_put_block_group(cache); + cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; + } + node = rb_next(&cache->cache_node); + btrfs_put_block_group(cache); + if (node) { + cache = rb_entry(node, struct btrfs_block_group, cache_node); + btrfs_get_block_group(cache); + } else + cache = NULL; + spin_unlock(&fs_info->block_group_cache_lock); + return cache; +} + +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group *bg; + bool ret = true; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg) + return false; + + spin_lock(&bg->lock); + if (bg->ro) + ret = false; + else + atomic_inc(&bg->nocow_writers); + spin_unlock(&bg->lock); + + /* No put on block group, done by btrfs_dec_nocow_writers */ + if (!ret) + btrfs_put_block_group(bg); + + return ret; +} + +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group *bg; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + ASSERT(bg); + if (atomic_dec_and_test(&bg->nocow_writers)) + wake_up_var(&bg->nocow_writers); + /* + * Once for our lookup and once for the lookup done by a previous call + * to btrfs_inc_nocow_writers() + */ + btrfs_put_block_group(bg); + btrfs_put_block_group(bg); +} + +void btrfs_wait_nocow_writers(struct btrfs_block_group *bg) +{ + wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); +} + +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start) +{ + struct btrfs_block_group *bg; + + bg = btrfs_lookup_block_group(fs_info, start); + ASSERT(bg); + if (atomic_dec_and_test(&bg->reservations)) + wake_up_var(&bg->reservations); + btrfs_put_block_group(bg); +} + +void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg) +{ + struct btrfs_space_info *space_info = bg->space_info; + + ASSERT(bg->ro); + + if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) + return; + + /* + * Our block group is read only but before we set it to read only, + * some task might have had allocated an extent from it already, but it + * has not yet created a respective ordered extent (and added it to a + * root's list of ordered extents). + * Therefore wait for any task currently allocating extents, since the + * block group's reservations counter is incremented while a read lock + * on the groups' semaphore is held and decremented after releasing + * the read access on that semaphore and creating the ordered extent. + */ + down_write(&space_info->groups_sem); + up_write(&space_info->groups_sem); + + wait_var_event(&bg->reservations, !atomic_read(&bg->reservations)); +} + +struct btrfs_caching_control *btrfs_get_caching_control( + struct btrfs_block_group *cache) +{ + struct btrfs_caching_control *ctl; + + spin_lock(&cache->lock); + if (!cache->caching_ctl) { + spin_unlock(&cache->lock); + return NULL; + } + + ctl = cache->caching_ctl; + refcount_inc(&ctl->count); + spin_unlock(&cache->lock); + return ctl; +} + +void btrfs_put_caching_control(struct btrfs_caching_control *ctl) +{ + if (refcount_dec_and_test(&ctl->count)) + kfree(ctl); +} + +/* + * When we wait for progress in the block group caching, its because our + * allocation attempt failed at least once. So, we must sleep and let some + * progress happen before we try again. + * + * This function will sleep at least once waiting for new free space to show + * up, and then it will check the block group free space numbers for our min + * num_bytes. Another option is to have it go ahead and look in the rbtree for + * a free extent of a given size, but this is a good start. + * + * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using + * any of the information in this block group. + */ +void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, + u64 num_bytes) +{ + struct btrfs_caching_control *caching_ctl; + + caching_ctl = btrfs_get_caching_control(cache); + if (!caching_ctl) + return; + + wait_event(caching_ctl->wait, btrfs_block_group_done(cache) || + (cache->free_space_ctl->free_space >= num_bytes)); + + btrfs_put_caching_control(caching_ctl); +} + +int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache) +{ + struct btrfs_caching_control *caching_ctl; + int ret = 0; + + caching_ctl = btrfs_get_caching_control(cache); + if (!caching_ctl) + return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; + + wait_event(caching_ctl->wait, btrfs_block_group_done(cache)); + if (cache->cached == BTRFS_CACHE_ERROR) + ret = -EIO; + btrfs_put_caching_control(caching_ctl); + return ret; +} + +#ifdef CONFIG_BTRFS_DEBUG +static void fragment_free_space(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + u64 start = block_group->start; + u64 len = block_group->length; + u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? + fs_info->nodesize : fs_info->sectorsize; + u64 step = chunk << 1; + + while (len > chunk) { + btrfs_remove_free_space(block_group, start, chunk); + start += step; + if (len < step) + len = 0; + else + len -= step; + } +} +#endif + +/* + * This is only called by btrfs_cache_block_group, since we could have freed + * extents we need to check the pinned_extents for any extents that can't be + * used yet since their free space will be released as soon as the transaction + * commits. + */ +u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end) +{ + struct btrfs_fs_info *info = block_group->fs_info; + u64 extent_start, extent_end, size, total_added = 0; + int ret; + + while (start < end) { + ret = find_first_extent_bit(info->pinned_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY | EXTENT_UPTODATE, + NULL); + if (ret) + break; + + if (extent_start <= start) { + start = extent_end + 1; + } else if (extent_start > start && extent_start < end) { + size = extent_start - start; + total_added += size; + ret = btrfs_add_free_space(block_group, start, + size); + BUG_ON(ret); /* -ENOMEM or logic error */ + start = extent_end + 1; + } else { + break; + } + } + + if (start < end) { + size = end - start; + total_added += size; + ret = btrfs_add_free_space(block_group, start, size); + BUG_ON(ret); /* -ENOMEM or logic error */ + } + + return total_added; +} + +static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) +{ + struct btrfs_block_group *block_group = caching_ctl->block_group; + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + u64 total_found = 0; + u64 last = 0; + u32 nritems; + int ret; + bool wakeup = true; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET); + +#ifdef CONFIG_BTRFS_DEBUG + /* + * If we're fragmenting we don't want to make anybody think we can + * allocate from this block group until we've had a chance to fragment + * the free space. + */ + if (btrfs_should_fragment_free_space(block_group)) + wakeup = false; +#endif + /* + * We don't want to deadlock with somebody trying to allocate a new + * extent for the extent root while also trying to search the extent + * root to add free space. So we skip locking and search the commit + * root, since its read-only + */ + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = READA_FORWARD; + + key.objectid = last; + key.offset = 0; + key.type = BTRFS_EXTENT_ITEM_KEY; + +next: + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + + while (1) { + if (btrfs_fs_closing(fs_info) > 1) { + last = (u64)-1; + break; + } + + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + } else { + ret = btrfs_find_next_key(extent_root, path, &key, 0, 0); + if (ret) + break; + + if (need_resched() || + rwsem_is_contended(&fs_info->commit_root_sem)) { + if (wakeup) + caching_ctl->progress = last; + btrfs_release_path(path); + up_read(&fs_info->commit_root_sem); + mutex_unlock(&caching_ctl->mutex); + cond_resched(); + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + goto next; + } + + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + continue; + } + + if (key.objectid < last) { + key.objectid = last; + key.offset = 0; + key.type = BTRFS_EXTENT_ITEM_KEY; + + if (wakeup) + caching_ctl->progress = last; + btrfs_release_path(path); + goto next; + } + + if (key.objectid < block_group->start) { + path->slots[0]++; + continue; + } + + if (key.objectid >= block_group->start + block_group->length) + break; + + if (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY) { + total_found += add_new_free_space(block_group, last, + key.objectid); + if (key.type == BTRFS_METADATA_ITEM_KEY) + last = key.objectid + + fs_info->nodesize; + else + last = key.objectid + key.offset; + + if (total_found > CACHING_CTL_WAKE_UP) { + total_found = 0; + if (wakeup) + wake_up(&caching_ctl->wait); + } + } + path->slots[0]++; + } + ret = 0; + + total_found += add_new_free_space(block_group, last, + block_group->start + block_group->length); + caching_ctl->progress = (u64)-1; + +out: + btrfs_free_path(path); + return ret; +} + +static noinline void caching_thread(struct btrfs_work *work) +{ + struct btrfs_block_group *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_caching_control *caching_ctl; + int ret; + + caching_ctl = container_of(work, struct btrfs_caching_control, work); + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + ret = load_free_space_tree(caching_ctl); + else + ret = load_extent_tree_free(caching_ctl); + + spin_lock(&block_group->lock); + block_group->caching_ctl = NULL; + block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED; + spin_unlock(&block_group->lock); + +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(block_group)) { + u64 bytes_used; + + spin_lock(&block_group->space_info->lock); + spin_lock(&block_group->lock); + bytes_used = block_group->length - block_group->used; + block_group->space_info->bytes_used += bytes_used >> 1; + spin_unlock(&block_group->lock); + spin_unlock(&block_group->space_info->lock); + fragment_free_space(block_group); + } +#endif + + caching_ctl->progress = (u64)-1; + + up_read(&fs_info->commit_root_sem); + btrfs_free_excluded_extents(block_group); + mutex_unlock(&caching_ctl->mutex); + + wake_up(&caching_ctl->wait); + + btrfs_put_caching_control(caching_ctl); + btrfs_put_block_group(block_group); +} + +int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only) +{ + DEFINE_WAIT(wait); + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_caching_control *caching_ctl; + int ret = 0; + + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); + if (!caching_ctl) + return -ENOMEM; + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->start; + refcount_set(&caching_ctl->count, 1); + btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); + + spin_lock(&cache->lock); + /* + * This should be a rare occasion, but this could happen I think in the + * case where one thread starts to load the space cache info, and then + * some other thread starts a transaction commit which tries to do an + * allocation while the other thread is still loading the space cache + * info. The previous loop should have kept us from choosing this block + * group, but if we've moved to the state where we will wait on caching + * block groups we need to first check if we're doing a fast load here, + * so we can wait for it to finish, otherwise we could end up allocating + * from a block group who's cache gets evicted for one reason or + * another. + */ + while (cache->cached == BTRFS_CACHE_FAST) { + struct btrfs_caching_control *ctl; + + ctl = cache->caching_ctl; + refcount_inc(&ctl->count); + prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&cache->lock); + + schedule(); + + finish_wait(&ctl->wait, &wait); + btrfs_put_caching_control(ctl); + spin_lock(&cache->lock); + } + + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + kfree(caching_ctl); + return 0; + } + WARN_ON(cache->caching_ctl); + cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_FAST; + spin_unlock(&cache->lock); + + if (btrfs_test_opt(fs_info, SPACE_CACHE)) { + mutex_lock(&caching_ctl->mutex); + ret = load_free_space_cache(cache); + + spin_lock(&cache->lock); + if (ret == 1) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_FINISHED; + cache->last_byte_to_unpin = (u64)-1; + caching_ctl->progress = (u64)-1; + } else { + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + cache->has_caching_ctl = 1; + } + } + spin_unlock(&cache->lock); +#ifdef CONFIG_BTRFS_DEBUG + if (ret == 1 && + btrfs_should_fragment_free_space(cache)) { + u64 bytes_used; + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + bytes_used = cache->length - cache->used; + cache->space_info->bytes_used += bytes_used >> 1; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + fragment_free_space(cache); + } +#endif + mutex_unlock(&caching_ctl->mutex); + + wake_up(&caching_ctl->wait); + if (ret == 1) { + btrfs_put_caching_control(caching_ctl); + btrfs_free_excluded_extents(cache); + return 0; + } + } else { + /* + * We're either using the free space tree or no caching at all. + * Set cached to the appropriate value and wakeup any waiters. + */ + spin_lock(&cache->lock); + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + cache->has_caching_ctl = 1; + } + spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); + } + + if (load_cache_only) { + btrfs_put_caching_control(caching_ctl); + return 0; + } + + down_write(&fs_info->commit_root_sem); + refcount_inc(&caching_ctl->count); + list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); + up_write(&fs_info->commit_root_sem); + + btrfs_get_block_group(cache); + + btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); + + return ret; +} + +static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; + + write_seqlock(&fs_info->profiles_lock); + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits &= ~extra_flags; + write_sequnlock(&fs_info->profiles_lock); +} + +/* + * Clear incompat bits for the following feature(s): + * + * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group + * in the whole filesystem + * + * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups + */ +static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + bool found_raid56 = false; + bool found_raid1c34 = false; + + if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) || + (flags & BTRFS_BLOCK_GROUP_RAID1C3) || + (flags & BTRFS_BLOCK_GROUP_RAID1C4)) { + struct list_head *head = &fs_info->space_info; + struct btrfs_space_info *sinfo; + + list_for_each_entry_rcu(sinfo, head, list) { + down_read(&sinfo->groups_sem); + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5])) + found_raid56 = true; + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6])) + found_raid56 = true; + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3])) + found_raid1c34 = true; + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4])) + found_raid1c34 = true; + up_read(&sinfo->groups_sem); + } + if (found_raid56) + btrfs_clear_fs_incompat(fs_info, RAID56); + if (found_raid1c34) + btrfs_clear_fs_incompat(fs_info, RAID1C34); + } +} + +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + u64 group_start, struct extent_map *em) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_path *path; + struct btrfs_block_group *block_group; + struct btrfs_free_cluster *cluster; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_key key; + struct inode *inode; + struct kobject *kobj = NULL; + int ret; + int index; + int factor; + struct btrfs_caching_control *caching_ctl = NULL; + bool remove_em; + bool remove_rsv = false; + + block_group = btrfs_lookup_block_group(fs_info, group_start); + BUG_ON(!block_group); + BUG_ON(!block_group->ro); + + trace_btrfs_remove_block_group(block_group); + /* + * Free the reserved super bytes from this block group before + * remove it. + */ + btrfs_free_excluded_extents(block_group); + btrfs_free_ref_tree_range(fs_info, block_group->start, + block_group->length); + + index = btrfs_bg_flags_to_raid_index(block_group->flags); + factor = btrfs_bg_type_to_factor(block_group->flags); + + /* make sure this block group isn't part of an allocation cluster */ + cluster = &fs_info->data_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + /* + * make sure this block group isn't part of a metadata + * allocation cluster + */ + cluster = &fs_info->meta_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + /* + * get the inode first so any iput calls done for the io_list + * aren't the final iput (no unlinks allowed now) + */ + inode = lookup_free_space_inode(block_group, path); + + mutex_lock(&trans->transaction->cache_write_mutex); + /* + * Make sure our free space cache IO is done before removing the + * free space inode + */ + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->io_list)) { + list_del_init(&block_group->io_list); + + WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); + + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_wait_cache_io(trans, block_group, path); + btrfs_put_block_group(block_group); + spin_lock(&trans->transaction->dirty_bgs_lock); + } + + if (!list_empty(&block_group->dirty_list)) { + list_del_init(&block_group->dirty_list); + remove_rsv = true; + btrfs_put_block_group(block_group); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + mutex_unlock(&trans->transaction->cache_write_mutex); + + if (!IS_ERR(inode)) { + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (ret) { + btrfs_add_delayed_iput(inode); + goto out; + } + clear_nlink(inode); + /* One for the block groups ref */ + spin_lock(&block_group->lock); + if (block_group->iref) { + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + } else { + spin_unlock(&block_group->lock); + } + /* One for our lookup ref */ + btrfs_add_delayed_iput(inode); + } + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.type = 0; + key.offset = block_group->start; + + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) + btrfs_release_path(path); + if (ret == 0) { + ret = btrfs_del_item(trans, tree_root, path); + if (ret) + goto out; + btrfs_release_path(path); + } + + spin_lock(&fs_info->block_group_cache_lock); + rb_erase(&block_group->cache_node, + &fs_info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); + + if (fs_info->first_logical_byte == block_group->start) + fs_info->first_logical_byte = (u64)-1; + spin_unlock(&fs_info->block_group_cache_lock); + + down_write(&block_group->space_info->groups_sem); + /* + * we must use list_del_init so people can check to see if they + * are still on the list after taking the semaphore + */ + list_del_init(&block_group->list); + if (list_empty(&block_group->space_info->block_groups[index])) { + kobj = block_group->space_info->block_group_kobjs[index]; + block_group->space_info->block_group_kobjs[index] = NULL; + clear_avail_alloc_bits(fs_info, block_group->flags); + } + up_write(&block_group->space_info->groups_sem); + clear_incompat_bg_bits(fs_info, block_group->flags); + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } + + if (block_group->has_caching_ctl) + caching_ctl = btrfs_get_caching_control(block_group); + if (block_group->cached == BTRFS_CACHE_STARTED) + btrfs_wait_block_group_cache_done(block_group); + if (block_group->has_caching_ctl) { + down_write(&fs_info->commit_root_sem); + if (!caching_ctl) { + struct btrfs_caching_control *ctl; + + list_for_each_entry(ctl, + &fs_info->caching_block_groups, list) + if (ctl->block_group == block_group) { + caching_ctl = ctl; + refcount_inc(&caching_ctl->count); + break; + } + } + if (caching_ctl) + list_del_init(&caching_ctl->list); + up_write(&fs_info->commit_root_sem); + if (caching_ctl) { + /* Once for the caching bgs list and once for us. */ + btrfs_put_caching_control(caching_ctl); + btrfs_put_caching_control(caching_ctl); + } + } + + spin_lock(&trans->transaction->dirty_bgs_lock); + WARN_ON(!list_empty(&block_group->dirty_list)); + WARN_ON(!list_empty(&block_group->io_list)); + spin_unlock(&trans->transaction->dirty_bgs_lock); + + btrfs_remove_free_space_cache(block_group); + + spin_lock(&block_group->space_info->lock); + list_del_init(&block_group->ro_list); + + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + WARN_ON(block_group->space_info->total_bytes + < block_group->length); + WARN_ON(block_group->space_info->bytes_readonly + < block_group->length); + WARN_ON(block_group->space_info->disk_total + < block_group->length * factor); + } + block_group->space_info->total_bytes -= block_group->length; + block_group->space_info->bytes_readonly -= block_group->length; + block_group->space_info->disk_total -= block_group->length * factor; + + spin_unlock(&block_group->space_info->lock); + + key.objectid = block_group->start; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = block_group->length; + + mutex_lock(&fs_info->chunk_mutex); + spin_lock(&block_group->lock); + block_group->removed = 1; + /* + * At this point trimming can't start on this block group, because we + * removed the block group from the tree fs_info->block_group_cache_tree + * so no one can't find it anymore and even if someone already got this + * block group before we removed it from the rbtree, they have already + * incremented block_group->trimming - if they didn't, they won't find + * any free space entries because we already removed them all when we + * called btrfs_remove_free_space_cache(). + * + * And we must not remove the extent map from the fs_info->mapping_tree + * to prevent the same logical address range and physical device space + * ranges from being reused for a new block group. This is because our + * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is + * completely transactionless, so while it is trimming a range the + * currently running transaction might finish and a new one start, + * allowing for new block groups to be created that can reuse the same + * physical device locations unless we take this special care. + * + * There may also be an implicit trim operation if the file system + * is mounted with -odiscard. The same protections must remain + * in place until the extents have been discarded completely when + * the transaction commit has completed. + */ + remove_em = (atomic_read(&block_group->trimming) == 0); + spin_unlock(&block_group->lock); + + mutex_unlock(&fs_info->chunk_mutex); + + ret = remove_block_group_free_space(trans, block_group); + if (ret) + goto out; + + btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -EIO; + if (ret < 0) + goto out; + + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + + if (remove_em) { + struct extent_map_tree *em_tree; + + em_tree = &fs_info->mapping_tree; + write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + /* once for the tree */ + free_extent_map(em); + } +out: + if (remove_rsv) + btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_free_path(path); + return ret; +} + +struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( + struct btrfs_fs_info *fs_info, const u64 chunk_offset) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + unsigned int num_items; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); + read_unlock(&em_tree->lock); + ASSERT(em && em->start == chunk_offset); + + /* + * We need to reserve 3 + N units from the metadata space info in order + * to remove a block group (done at btrfs_remove_chunk() and at + * btrfs_remove_block_group()), which are used for: + * + * 1 unit for adding the free space inode's orphan (located in the tree + * of tree roots). + * 1 unit for deleting the block group item (located in the extent + * tree). + * 1 unit for deleting the free space item (located in tree of tree + * roots). + * N units for deleting N device extent items corresponding to each + * stripe (located in the device tree). + * + * In order to remove a block group we also need to reserve units in the + * system space info in order to update the chunk tree (update one or + * more device items and remove one chunk item), but this is done at + * btrfs_remove_chunk() through a call to check_system_chunk(). + */ + map = em->map_lookup; + num_items = 3 + map->num_stripes; + free_extent_map(em); + + return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, + num_items, 1); +} + +/* + * Mark block group @cache read-only, so later write won't happen to block + * group @cache. + * + * If @force is not set, this function will only mark the block group readonly + * if we have enough free space (1M) in other metadata/system block groups. + * If @force is not set, this function will mark the block group readonly + * without checking free space. + * + * NOTE: This function doesn't care if other block groups can contain all the + * data in this block group. That check should be done by relocation routine, + * not this function. + */ +static int inc_block_group_ro(struct btrfs_block_group *cache, int force) +{ + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + u64 sinfo_used; + u64 min_allocable_bytes; + int ret = -ENOSPC; + + /* + * We need some metadata space and system metadata space for + * allocating chunks in some corner cases until we force to set + * it to be readonly. + */ + if ((sinfo->flags & + (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && + !force) + min_allocable_bytes = SZ_1M; + else + min_allocable_bytes = 0; + + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + + if (cache->ro) { + cache->ro++; + ret = 0; + goto out; + } + + num_bytes = cache->length - cache->reserved - cache->pinned - + cache->bytes_super - cache->used; + sinfo_used = btrfs_space_info_used(sinfo, true); + + /* + * sinfo_used + num_bytes should always <= sinfo->total_bytes. + * + * Here we make sure if we mark this bg RO, we still have enough + * free space as buffer (if min_allocable_bytes is not 0). + */ + if (sinfo_used + num_bytes + min_allocable_bytes <= + sinfo->total_bytes) { + sinfo->bytes_readonly += num_bytes; + cache->ro++; + list_add_tail(&cache->ro_list, &sinfo->ro_bgs); + ret = 0; + } +out: + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); + if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { + btrfs_info(cache->fs_info, + "unable to make block group %llu ro", cache->start); + btrfs_info(cache->fs_info, + "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", + sinfo_used, num_bytes, min_allocable_bytes); + btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); + } + return ret; +} + +/* + * Process the unused_bgs list and remove any that don't have any allocated + * space inside of them. + */ +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *block_group; + struct btrfs_space_info *space_info; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) + return; + + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->unused_bgs)) { + u64 start, end; + int trimming; + + block_group = list_first_entry(&fs_info->unused_bgs, + struct btrfs_block_group, + bg_list); + list_del_init(&block_group->bg_list); + + space_info = block_group->space_info; + + if (ret || btrfs_mixed_space_info(space_info)) { + btrfs_put_block_group(block_group); + continue; + } + spin_unlock(&fs_info->unused_bgs_lock); + + mutex_lock(&fs_info->delete_unused_bgs_mutex); + + /* Don't want to race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + spin_lock(&block_group->lock); + if (block_group->reserved || block_group->pinned || + block_group->used || block_group->ro || + list_is_singular(&block_group->list)) { + /* + * We want to bail if we made new allocations or have + * outstanding allocations in this block group. We do + * the ro check in case balance is currently acting on + * this block group. + */ + trace_btrfs_skip_unused_block_group(block_group); + spin_unlock(&block_group->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&block_group->lock); + + /* We don't want to force the issue, only flip if it's ok. */ + ret = inc_block_group_ro(block_group, 0); + up_write(&space_info->groups_sem); + if (ret < 0) { + ret = 0; + goto next; + } + + /* + * Want to do this before we do anything else so we can recover + * properly if we fail to join the transaction. + */ + trans = btrfs_start_trans_remove_block_group(fs_info, + block_group->start); + if (IS_ERR(trans)) { + btrfs_dec_block_group_ro(block_group); + ret = PTR_ERR(trans); + goto next; + } + + /* + * We could have pending pinned extents for this block group, + * just delete them, we don't care about them anymore. + */ + start = block_group->start; + end = start + block_group->length - 1; + /* + * Hold the unused_bg_unpin_mutex lock to avoid racing with + * btrfs_finish_extent_commit(). If we are at transaction N, + * another task might be running finish_extent_commit() for the + * previous transaction N - 1, and have seen a range belonging + * to the block group in freed_extents[] before we were able to + * clear the whole block group range from freed_extents[]. This + * means that task can lookup for the block group after we + * unpinned it from freed_extents[] and removed it, leading to + * a BUG_ON() at btrfs_unpin_extent_range(). + */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); + ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, + EXTENT_DIRTY); + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_dec_block_group_ro(block_group); + goto end_trans; + } + ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, + EXTENT_DIRTY); + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_dec_block_group_ro(block_group); + goto end_trans; + } + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + + /* Reset pinned so btrfs_put_block_group doesn't complain */ + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + + btrfs_space_info_update_bytes_pinned(fs_info, space_info, + -block_group->pinned); + space_info->bytes_readonly += block_group->pinned; + percpu_counter_add_batch(&space_info->total_bytes_pinned, + -block_group->pinned, + BTRFS_TOTAL_BYTES_PINNED_BATCH); + block_group->pinned = 0; + + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + + /* DISCARD can flip during remount */ + trimming = btrfs_test_opt(fs_info, DISCARD); + + /* Implicit trim during transaction commit. */ + if (trimming) + btrfs_get_block_group_trimming(block_group); + + /* + * Btrfs_remove_chunk will abort the transaction if things go + * horribly wrong. + */ + ret = btrfs_remove_chunk(trans, block_group->start); + + if (ret) { + if (trimming) + btrfs_put_block_group_trimming(block_group); + goto end_trans; + } + + /* + * If we're not mounted with -odiscard, we can just forget + * about this block group. Otherwise we'll need to wait + * until transaction commit to do the actual discard. + */ + if (trimming) { + spin_lock(&fs_info->unused_bgs_lock); + /* + * A concurrent scrub might have added us to the list + * fs_info->unused_bgs, so use a list_move operation + * to add the block group to the deleted_bgs list. + */ + list_move(&block_group->bg_list, + &trans->transaction->deleted_bgs); + spin_unlock(&fs_info->unused_bgs_lock); + btrfs_get_block_group(block_group); + } +end_trans: + btrfs_end_transaction(trans); +next: + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + btrfs_put_block_group(block_group); + spin_lock(&fs_info->unused_bgs_lock); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + +void btrfs_mark_bg_unused(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + spin_lock(&fs_info->unused_bgs_lock); + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + trace_btrfs_add_unused_block_group(bg); + list_add_tail(&bg->bg_list, &fs_info->unused_bgs); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + +static int find_first_block_group(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_root *root = fs_info->extent_root; + int ret = 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + struct btrfs_block_group_item bg; + u64 flags; + int slot; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + slot = path->slots[0]; + leaf = path->nodes[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid >= key->objectid && + found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + struct extent_map_tree *em_tree; + struct extent_map *em; + + em_tree = &root->fs_info->mapping_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, found_key.objectid, + found_key.offset); + read_unlock(&em_tree->lock); + if (!em) { + btrfs_err(fs_info, + "logical %llu len %llu found bg but no related chunk", + found_key.objectid, found_key.offset); + ret = -ENOENT; + } else if (em->start != found_key.objectid || + em->len != found_key.offset) { + btrfs_err(fs_info, + "block group %llu len %llu mismatch with chunk %llu len %llu", + found_key.objectid, found_key.offset, + em->start, em->len); + ret = -EUCLEAN; + } else { + read_extent_buffer(leaf, &bg, + btrfs_item_ptr_offset(leaf, slot), + sizeof(bg)); + flags = btrfs_stack_block_group_flags(&bg) & + BTRFS_BLOCK_GROUP_TYPE_MASK; + + if (flags != (em->map_lookup->type & + BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(fs_info, +"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", + found_key.objectid, + found_key.offset, flags, + (BTRFS_BLOCK_GROUP_TYPE_MASK & + em->map_lookup->type)); + ret = -EUCLEAN; + } else { + ret = 0; + } + } + free_extent_map(em); + goto out; + } + path->slots[0]++; + } +out: + return ret; +} + +static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; + + write_seqlock(&fs_info->profiles_lock); + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; + write_sequnlock(&fs_info->profiles_lock); +} + +static int exclude_super_stripes(struct btrfs_block_group *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + u64 bytenr; + u64 *logical; + int stripe_len; + int i, nr, ret; + + if (cache->start < BTRFS_SUPER_INFO_OFFSET) { + stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start; + cache->bytes_super += stripe_len; + ret = btrfs_add_excluded_extent(fs_info, cache->start, + stripe_len); + if (ret) + return ret; + } + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + ret = btrfs_rmap_block(fs_info, cache->start, + bytenr, &logical, &nr, &stripe_len); + if (ret) + return ret; + + while (nr--) { + u64 start, len; + + if (logical[nr] > cache->start + cache->length) + continue; + + if (logical[nr] + stripe_len <= cache->start) + continue; + + start = logical[nr]; + if (start < cache->start) { + start = cache->start; + len = (logical[nr] + stripe_len) - start; + } else { + len = min_t(u64, stripe_len, + cache->start + cache->length - start); + } + + cache->bytes_super += len; + ret = btrfs_add_excluded_extent(fs_info, start, len); + if (ret) { + kfree(logical); + return ret; + } + } + + kfree(logical); + } + return 0; +} + +static void link_block_group(struct btrfs_block_group *cache) +{ + struct btrfs_space_info *space_info = cache->space_info; + int index = btrfs_bg_flags_to_raid_index(cache->flags); + bool first = false; + + down_write(&space_info->groups_sem); + if (list_empty(&space_info->block_groups[index])) + first = true; + list_add_tail(&cache->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); + + if (first) + btrfs_sysfs_add_block_group_type(cache); +} + +static struct btrfs_block_group *btrfs_create_block_group_cache( + struct btrfs_fs_info *fs_info, u64 start, u64 size) +{ + struct btrfs_block_group *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->start = start; + cache->length = size; + + cache->fs_info = fs_info; + cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); + set_free_space_tree_thresholds(cache); + + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + init_rwsem(&cache->data_rwsem); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->bg_list); + INIT_LIST_HEAD(&cache->ro_list); + INIT_LIST_HEAD(&cache->dirty_list); + INIT_LIST_HEAD(&cache->io_list); + btrfs_init_free_space_ctl(cache); + atomic_set(&cache->trimming, 0); + mutex_init(&cache->free_space_lock); + btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); + + return cache; +} + +/* + * Iterate all chunks and verify that each of them has the corresponding block + * group + */ +static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) +{ + struct extent_map_tree *map_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct btrfs_block_group *bg; + u64 start = 0; + int ret = 0; + + while (1) { + read_lock(&map_tree->lock); + /* + * lookup_extent_mapping will return the first extent map + * intersecting the range, so setting @len to 1 is enough to + * get the first chunk. + */ + em = lookup_extent_mapping(map_tree, start, 1); + read_unlock(&map_tree->lock); + if (!em) + break; + + bg = btrfs_lookup_block_group(fs_info, em->start); + if (!bg) { + btrfs_err(fs_info, + "chunk start=%llu len=%llu doesn't have corresponding block group", + em->start, em->len); + ret = -EUCLEAN; + free_extent_map(em); + break; + } + if (bg->start != em->start || bg->length != em->len || + (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != + (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(fs_info, +"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", + em->start, em->len, + em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, + bg->start, bg->length, + bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); + ret = -EUCLEAN; + free_extent_map(em); + btrfs_put_block_group(bg); + break; + } + start = em->start + em->len; + free_extent_map(em); + btrfs_put_block_group(bg); + } + return ret; +} + +static int read_one_block_group(struct btrfs_fs_info *info, + struct btrfs_path *path, + const struct btrfs_key *key, + int need_clear) +{ + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_block_group *cache; + struct btrfs_space_info *space_info; + struct btrfs_block_group_item bgi; + const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS); + int slot = path->slots[0]; + int ret; + + ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY); + + cache = btrfs_create_block_group_cache(info, key->objectid, key->offset); + if (!cache) + return -ENOMEM; + + if (need_clear) { + /* + * When we mount with old space cache, we need to + * set BTRFS_DC_CLEAR and set dirty flag. + * + * a) Setting 'BTRFS_DC_CLEAR' makes sure that we + * truncate the old free space cache inode and + * setup a new one. + * b) Setting 'dirty flag' makes sure that we flush + * the new space cache info onto disk. + */ + if (btrfs_test_opt(info, SPACE_CACHE)) + cache->disk_cache_state = BTRFS_DC_CLEAR; + } + read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), + sizeof(bgi)); + cache->used = btrfs_stack_block_group_used(&bgi); + cache->flags = btrfs_stack_block_group_flags(&bgi); + if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && + (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { + btrfs_err(info, +"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", + cache->start); + ret = -EINVAL; + goto error; + } + + /* + * We need to exclude the super stripes now so that the space info has + * super bytes accounted for, otherwise we'll think we have more space + * than we actually do. + */ + ret = exclude_super_stripes(cache); + if (ret) { + /* We may have excluded something, so call this just in case. */ + btrfs_free_excluded_extents(cache); + goto error; + } + + /* + * Check for two cases, either we are full, and therefore don't need + * to bother with the caching work since we won't find any space, or we + * are empty, and we can just add all the space in and be done with it. + * This saves us _a_lot_ of time, particularly in the full case. + */ + if (key->offset == cache->used) { + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + btrfs_free_excluded_extents(cache); + } else if (cache->used == 0) { + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + add_new_free_space(cache, key->objectid, + key->objectid + key->offset); + btrfs_free_excluded_extents(cache); + } + + ret = btrfs_add_block_group_cache(info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + goto error; + } + trace_btrfs_add_block_group(info, cache, 0); + btrfs_update_space_info(info, cache->flags, key->offset, + cache->used, cache->bytes_super, &space_info); + + cache->space_info = space_info; + + link_block_group(cache); + + set_avail_alloc_bits(info, cache->flags); + if (btrfs_chunk_readonly(info, cache->start)) { + inc_block_group_ro(cache, 1); + } else if (cache->used == 0) { + ASSERT(list_empty(&cache->bg_list)); + btrfs_mark_bg_unused(cache); + } + return 0; +error: + btrfs_put_block_group(cache); + return ret; +} + +int btrfs_read_block_groups(struct btrfs_fs_info *info) +{ + struct btrfs_path *path; + int ret; + struct btrfs_block_group *cache; + struct btrfs_space_info *space_info; + struct btrfs_key key; + int need_clear = 0; + u64 cache_gen; + + key.objectid = 0; + key.offset = 0; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = READA_FORWARD; + + cache_gen = btrfs_super_cache_generation(info->super_copy); + if (btrfs_test_opt(info, SPACE_CACHE) && + btrfs_super_generation(info->super_copy) != cache_gen) + need_clear = 1; + if (btrfs_test_opt(info, CLEAR_CACHE)) + need_clear = 1; + + while (1) { + ret = find_first_block_group(info, path, &key); + if (ret > 0) + break; + if (ret != 0) + goto error; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ret = read_one_block_group(info, path, &key, need_clear); + if (ret < 0) + goto error; + key.objectid += key.offset; + key.offset = 0; + btrfs_release_path(path); + } + + list_for_each_entry_rcu(space_info, &info->space_info, list) { + if (!(btrfs_get_alloc_profile(info, space_info->flags) & + (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID1_MASK | + BTRFS_BLOCK_GROUP_RAID56_MASK | + BTRFS_BLOCK_GROUP_DUP))) + continue; + /* + * Avoid allocating from un-mirrored block group if there are + * mirrored block groups. + */ + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_RAID0], + list) + inc_block_group_ro(cache, 1); + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_SINGLE], + list) + inc_block_group_ro(cache, 1); + } + + btrfs_init_global_block_rsv(info); + ret = check_chunk_block_group_mappings(info); +error: + btrfs_free_path(path); + return ret; +} + +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group *block_group; + struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_block_group_item item; + struct btrfs_key key; + int ret = 0; + + if (!trans->can_flush_pending_bgs) + return; + + while (!list_empty(&trans->new_bgs)) { + block_group = list_first_entry(&trans->new_bgs, + struct btrfs_block_group, + bg_list); + if (ret) + goto next; + + spin_lock(&block_group->lock); + btrfs_set_stack_block_group_used(&item, block_group->used); + btrfs_set_stack_block_group_chunk_objectid(&item, + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + btrfs_set_stack_block_group_flags(&item, block_group->flags); + key.objectid = block_group->start; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = block_group->length; + spin_unlock(&block_group->lock); + + ret = btrfs_insert_item(trans, extent_root, &key, &item, + sizeof(item)); + if (ret) + btrfs_abort_transaction(trans, ret); + ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset); + if (ret) + btrfs_abort_transaction(trans, ret); + add_block_group_free_space(trans, block_group); + /* Already aborted the transaction if it failed. */ +next: + btrfs_delayed_refs_rsv_release(fs_info, 1); + list_del_init(&block_group->bg_list); + } + btrfs_trans_release_chunk_metadata(trans); +} + +int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, + u64 type, u64 chunk_offset, u64 size) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group *cache; + int ret; + + btrfs_set_log_full_commit(trans); + + cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); + if (!cache) + return -ENOMEM; + + cache->used = bytes_used; + cache->flags = type; + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + cache->needs_free_space = 1; + ret = exclude_super_stripes(cache); + if (ret) { + /* We may have excluded something, so call this just in case */ + btrfs_free_excluded_extents(cache); + btrfs_put_block_group(cache); + return ret; + } + + add_new_free_space(cache, chunk_offset, chunk_offset + size); + + btrfs_free_excluded_extents(cache); + +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(cache)) { + u64 new_bytes_used = size - bytes_used; + + bytes_used += new_bytes_used >> 1; + fragment_free_space(cache); + } +#endif + /* + * Ensure the corresponding space_info object is created and + * assigned to our block group. We want our bg to be added to the rbtree + * with its ->space_info set. + */ + cache->space_info = btrfs_find_space_info(fs_info, cache->flags); + ASSERT(cache->space_info); + + ret = btrfs_add_block_group_cache(fs_info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } + + /* + * Now that our block group has its ->space_info set and is inserted in + * the rbtree, update the space info's counters. + */ + trace_btrfs_add_block_group(fs_info, cache, 1); + btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, + cache->bytes_super, &cache->space_info); + btrfs_update_global_block_rsv(fs_info); + + link_block_group(cache); + + list_add_tail(&cache->bg_list, &trans->new_bgs); + trans->delayed_ref_updates++; + btrfs_update_delayed_refs_rsv(trans); + + set_avail_alloc_bits(fs_info, type); + return 0; +} + +static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 num_devices; + u64 stripped; + + /* + * if restripe for this chunk_type is on pick target profile and + * return, otherwise do the usual balance + */ + stripped = get_restripe_target(fs_info, flags); + if (stripped) + return extended_to_chunk(stripped); + + num_devices = fs_info->fs_devices->rw_devices; + + stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK | + BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10; + + if (num_devices == 1) { + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* turn raid0 into single device chunks */ + if (flags & BTRFS_BLOCK_GROUP_RAID0) + return stripped; + + /* turn mirroring into duplication */ + if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK | + BTRFS_BLOCK_GROUP_RAID10)) + return stripped | BTRFS_BLOCK_GROUP_DUP; + } else { + /* they already had raid on here, just return */ + if (flags & stripped) + return flags; + + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* switch duplicated blocks with raid1 */ + if (flags & BTRFS_BLOCK_GROUP_DUP) + return stripped | BTRFS_BLOCK_GROUP_RAID1; + + /* this is drive concat, leave it alone */ + } + + return flags; +} + +/* + * Mark one block group RO, can be called several times for the same block + * group. + * + * @cache: the destination block group + * @do_chunk_alloc: whether need to do chunk pre-allocation, this is to + * ensure we still have some free space after marking this + * block group RO. + */ +int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, + bool do_chunk_alloc) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_trans_handle *trans; + u64 alloc_flags; + int ret; + +again: + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * we're not allowed to set block groups readonly after the dirty + * block groups cache has started writing. If it already started, + * back off and let this transaction commit + */ + mutex_lock(&fs_info->ro_block_group_mutex); + if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { + u64 transid = trans->transid; + + mutex_unlock(&fs_info->ro_block_group_mutex); + btrfs_end_transaction(trans); + + ret = btrfs_wait_for_commit(fs_info, transid); + if (ret) + return ret; + goto again; + } + + if (do_chunk_alloc) { + /* + * If we are changing raid levels, try to allocate a + * corresponding block group with the new raid level. + */ + alloc_flags = update_block_group_flags(fs_info, cache->flags); + if (alloc_flags != cache->flags) { + ret = btrfs_chunk_alloc(trans, alloc_flags, + CHUNK_ALLOC_FORCE); + /* + * ENOSPC is allowed here, we may have enough space + * already allocated at the new raid level to carry on + */ + if (ret == -ENOSPC) + ret = 0; + if (ret < 0) + goto out; + } + } + + ret = inc_block_group_ro(cache, !do_chunk_alloc); + if (!do_chunk_alloc) + goto unlock_out; + if (!ret) + goto out; + alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); + ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + if (ret < 0) + goto out; + ret = inc_block_group_ro(cache, 0); +out: + if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { + alloc_flags = update_block_group_flags(fs_info, cache->flags); + mutex_lock(&fs_info->chunk_mutex); + check_system_chunk(trans, alloc_flags); + mutex_unlock(&fs_info->chunk_mutex); + } +unlock_out: + mutex_unlock(&fs_info->ro_block_group_mutex); + + btrfs_end_transaction(trans); + return ret; +} + +void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) +{ + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + + BUG_ON(!cache->ro); + + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + if (!--cache->ro) { + num_bytes = cache->length - cache->reserved - + cache->pinned - cache->bytes_super - cache->used; + sinfo->bytes_readonly -= num_bytes; + list_del_init(&cache->ro_list); + } + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); +} + +static int write_one_cache_group(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_block_group *cache) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; + struct btrfs_root *extent_root = fs_info->extent_root; + unsigned long bi; + struct extent_buffer *leaf; + struct btrfs_block_group_item bgi; + struct btrfs_key key; + + key.objectid = cache->start; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = cache->length; + + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto fail; + } + + leaf = path->nodes[0]; + bi = btrfs_item_ptr_offset(leaf, path->slots[0]); + btrfs_set_stack_block_group_used(&bgi, cache->used); + btrfs_set_stack_block_group_chunk_objectid(&bgi, + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + btrfs_set_stack_block_group_flags(&bgi, cache->flags); + write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); + btrfs_mark_buffer_dirty(leaf); +fail: + btrfs_release_path(path); + return ret; + +} + +static int cache_save_setup(struct btrfs_block_group *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_root *root = fs_info->tree_root; + struct inode *inode = NULL; + struct extent_changeset *data_reserved = NULL; + u64 alloc_hint = 0; + int dcs = BTRFS_DC_ERROR; + u64 num_pages = 0; + int retries = 0; + int ret = 0; + + /* + * If this block group is smaller than 100 megs don't bother caching the + * block group. + */ + if (block_group->length < (100 * SZ_1M)) { + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + return 0; + } + + if (trans->aborted) + return 0; +again: + inode = lookup_free_space_inode(block_group, path); + if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { + ret = PTR_ERR(inode); + btrfs_release_path(path); + goto out; + } + + if (IS_ERR(inode)) { + BUG_ON(retries); + retries++; + + if (block_group->ro) + goto out_free; + + ret = create_free_space_inode(trans, block_group, path); + if (ret) + goto out_free; + goto again; + } + + /* + * We want to set the generation to 0, that way if anything goes wrong + * from here on out we know not to trust this cache when we load up next + * time. + */ + BTRFS_I(inode)->generation = 0; + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + /* + * So theoretically we could recover from this, simply set the + * super cache generation to 0 so we know to invalidate the + * cache, but then we'd have to keep track of the block groups + * that fail this way so we know we _have_ to reset this cache + * before the next commit or risk reading stale cache. So to + * limit our exposure to horrible edge cases lets just abort the + * transaction, this only happens in really bad situations + * anyway. + */ + btrfs_abort_transaction(trans, ret); + goto out_put; + } + WARN_ON(ret); + + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + + if (i_size_read(inode) > 0) { + ret = btrfs_check_trunc_cache_free_space(fs_info, + &fs_info->global_block_rsv); + if (ret) + goto out_put; + + ret = btrfs_truncate_free_space_cache(trans, NULL, inode); + if (ret) + goto out_put; + } + + spin_lock(&block_group->lock); + if (block_group->cached != BTRFS_CACHE_FINISHED || + !btrfs_test_opt(fs_info, SPACE_CACHE)) { + /* + * don't bother trying to write stuff out _if_ + * a) we're not cached, + * b) we're with nospace_cache mount option, + * c) we're with v2 space_cache (FREE_SPACE_TREE). + */ + dcs = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + goto out_put; + } + spin_unlock(&block_group->lock); + + /* + * We hit an ENOSPC when setting up the cache in this transaction, just + * skip doing the setup, we've already cleared the cache so we're safe. + */ + if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { + ret = -ENOSPC; + goto out_put; + } + + /* + * Try to preallocate enough space based on how big the block group is. + * Keep in mind this has to include any pinned space which could end up + * taking up quite a bit since it's not folded into the other space + * cache. + */ + num_pages = div_u64(block_group->length, SZ_256M); + if (!num_pages) + num_pages = 1; + + num_pages *= 16; + num_pages *= PAGE_SIZE; + + ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); + if (ret) + goto out_put; + + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, + num_pages, num_pages, + &alloc_hint); + /* + * Our cache requires contiguous chunks so that we don't modify a bunch + * of metadata or split extents when writing the cache out, which means + * we can enospc if we are heavily fragmented in addition to just normal + * out of space conditions. So if we hit this just skip setting up any + * other block groups for this transaction, maybe we'll unpin enough + * space the next time around. + */ + if (!ret) + dcs = BTRFS_DC_SETUP; + else if (ret == -ENOSPC) + set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); + +out_put: + iput(inode); +out_free: + btrfs_release_path(path); +out: + spin_lock(&block_group->lock); + if (!ret && dcs == BTRFS_DC_SETUP) + block_group->cache_generation = trans->transid; + block_group->disk_cache_state = dcs; + spin_unlock(&block_group->lock); + + extent_changeset_free(data_reserved); + return ret; +} + +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group *cache, *tmp; + struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_path *path; + + if (list_empty(&cur_trans->dirty_bgs) || + !btrfs_test_opt(fs_info, SPACE_CACHE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Could add new block groups, use _safe just in case */ + list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, + dirty_list) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + cache_save_setup(cache, trans, path); + } + + btrfs_free_path(path); + return 0; +} + +/* + * Transaction commit does final block group cache writeback during a critical + * section where nothing is allowed to change the FS. This is required in + * order for the cache to actually match the block group, but can introduce a + * lot of latency into the commit. + * + * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO. + * There's a chance we'll have to redo some of it if the block group changes + * again during the commit, but it greatly reduces the commit latency by + * getting rid of the easy block groups while we're still allowing others to + * join the commit. + */ +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group *cache; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; + int should_put; + struct btrfs_path *path = NULL; + LIST_HEAD(dirty); + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; + int loops = 0; + + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cur_trans->dirty_bgs)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + return 0; + } + list_splice_init(&cur_trans->dirty_bgs, &dirty); + spin_unlock(&cur_trans->dirty_bgs_lock); + +again: + /* Make sure all the block groups on our dirty list actually exist */ + btrfs_create_pending_block_groups(trans); + + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } + + /* + * cache_write_mutex is here only to save us from balance or automatic + * removal of empty block groups deleting this block group while we are + * writing out the cache + */ + mutex_lock(&trans->transaction->cache_write_mutex); + while (!list_empty(&dirty)) { + bool drop_reserve = true; + + cache = list_first_entry(&dirty, struct btrfs_block_group, + dirty_list); + /* + * This can happen if something re-dirties a block group that + * is already under IO. Just wait for it to finish and then do + * it all again + */ + if (!list_empty(&cache->io_list)) { + list_del_init(&cache->io_list); + btrfs_wait_cache_io(trans, cache, path); + btrfs_put_block_group(cache); + } + + + /* + * btrfs_wait_cache_io uses the cache->dirty_list to decide if + * it should update the cache_state. Don't delete until after + * we wait. + * + * Since we're not running in the commit critical section + * we need the dirty_bgs_lock to protect from update_block_group + */ + spin_lock(&cur_trans->dirty_bgs_lock); + list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); + + should_put = 1; + + cache_save_setup(cache, trans, path); + + if (cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + + /* + * The cache_write_mutex is protecting the + * io_list, also refer to the definition of + * btrfs_transaction::io_bgs for more details + */ + list_add_tail(&cache->io_list, io); + } else { + /* + * If we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } + if (!ret) { + ret = write_one_cache_group(trans, path, cache); + /* + * Our block group might still be attached to the list + * of new block groups in the transaction handle of some + * other task (struct btrfs_trans_handle->new_bgs). This + * means its block group item isn't yet in the extent + * tree. If this happens ignore the error, as we will + * try again later in the critical section of the + * transaction commit. + */ + if (ret == -ENOENT) { + ret = 0; + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &cur_trans->dirty_bgs); + btrfs_get_block_group(cache); + drop_reserve = false; + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } else if (ret) { + btrfs_abort_transaction(trans, ret); + } + } + + /* If it's not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + if (drop_reserve) + btrfs_delayed_refs_rsv_release(fs_info, 1); + + if (ret) + break; + + /* + * Avoid blocking other tasks for too long. It might even save + * us from writing caches for block groups that are going to be + * removed. + */ + mutex_unlock(&trans->transaction->cache_write_mutex); + mutex_lock(&trans->transaction->cache_write_mutex); + } + mutex_unlock(&trans->transaction->cache_write_mutex); + + /* + * Go through delayed refs for all the stuff we've just kicked off + * and then loop back (just once) + */ + ret = btrfs_run_delayed_refs(trans, 0); + if (!ret && loops == 0) { + loops++; + spin_lock(&cur_trans->dirty_bgs_lock); + list_splice_init(&cur_trans->dirty_bgs, &dirty); + /* + * dirty_bgs_lock protects us from concurrent block group + * deletes too (not just cache_write_mutex). + */ + if (!list_empty(&dirty)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + goto again; + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } else if (ret < 0) { + btrfs_cleanup_dirty_bgs(cur_trans, fs_info); + } + + btrfs_free_path(path); + return ret; +} + +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group *cache; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; + int should_put; + struct btrfs_path *path; + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Even though we are in the critical section of the transaction commit, + * we can still have concurrent tasks adding elements to this + * transaction's list of dirty block groups. These tasks correspond to + * endio free space workers started when writeback finishes for a + * space cache, which run inode.c:btrfs_finish_ordered_io(), and can + * allocate new block groups as a result of COWing nodes of the root + * tree when updating the free space inode. The writeback for the space + * caches is triggered by an earlier call to + * btrfs_start_dirty_block_groups() and iterations of the following + * loop. + * Also we want to do the cache_save_setup first and then run the + * delayed refs to make sure we have the best chance at doing this all + * in one shot. + */ + spin_lock(&cur_trans->dirty_bgs_lock); + while (!list_empty(&cur_trans->dirty_bgs)) { + cache = list_first_entry(&cur_trans->dirty_bgs, + struct btrfs_block_group, + dirty_list); + + /* + * This can happen if cache_save_setup re-dirties a block group + * that is already under IO. Just wait for it to finish and + * then do it all again + */ + if (!list_empty(&cache->io_list)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + list_del_init(&cache->io_list); + btrfs_wait_cache_io(trans, cache, path); + btrfs_put_block_group(cache); + spin_lock(&cur_trans->dirty_bgs_lock); + } + + /* + * Don't remove from the dirty list until after we've waited on + * any pending IO + */ + list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); + should_put = 1; + + cache_save_setup(cache, trans, path); + + if (!ret) + ret = btrfs_run_delayed_refs(trans, + (unsigned long) -1); + + if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + list_add_tail(&cache->io_list, io); + } else { + /* + * If we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } + if (!ret) { + ret = write_one_cache_group(trans, path, cache); + /* + * One of the free space endio workers might have + * created a new block group while updating a free space + * cache's inode (at inode.c:btrfs_finish_ordered_io()) + * and hasn't released its transaction handle yet, in + * which case the new block group is still attached to + * its transaction handle and its creation has not + * finished yet (no block group item in the extent tree + * yet, etc). If this is the case, wait for all free + * space endio workers to finish and retry. This is a + * a very rare case so no need for a more efficient and + * complex approach. + */ + if (ret == -ENOENT) { + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + ret = write_one_cache_group(trans, path, cache); + } + if (ret) + btrfs_abort_transaction(trans, ret); + } + + /* If its not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + btrfs_delayed_refs_rsv_release(fs_info, 1); + spin_lock(&cur_trans->dirty_bgs_lock); + } + spin_unlock(&cur_trans->dirty_bgs_lock); + + /* + * Refer to the definition of io_bgs member for details why it's safe + * to use it without any locking + */ + while (!list_empty(io)) { + cache = list_first_entry(io, struct btrfs_block_group, + io_list); + list_del_init(&cache->io_list); + btrfs_wait_cache_io(trans, cache, path); + btrfs_put_block_group(cache); + } + + btrfs_free_path(path); + return ret; +} + +int btrfs_update_block_group(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, int alloc) +{ + struct btrfs_fs_info *info = trans->fs_info; + struct btrfs_block_group *cache = NULL; + u64 total = num_bytes; + u64 old_val; + u64 byte_in_group; + int factor; + int ret = 0; + + /* Block accounting for super block */ + spin_lock(&info->delalloc_root_lock); + old_val = btrfs_super_bytes_used(info->super_copy); + if (alloc) + old_val += num_bytes; + else + old_val -= num_bytes; + btrfs_set_super_bytes_used(info->super_copy, old_val); + spin_unlock(&info->delalloc_root_lock); + + while (total) { + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) { + ret = -ENOENT; + break; + } + factor = btrfs_bg_type_to_factor(cache->flags); + + /* + * If this block group has free space cache written out, we + * need to make sure to load it if we are removing space. This + * is because we need the unpinning stage to actually add the + * space back to the block group, otherwise we will leak space. + */ + if (!alloc && !btrfs_block_group_done(cache)) + btrfs_cache_block_group(cache, 1); + + byte_in_group = bytenr - cache->start; + WARN_ON(byte_in_group > cache->length); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + + if (btrfs_test_opt(info, SPACE_CACHE) && + cache->disk_cache_state < BTRFS_DC_CLEAR) + cache->disk_cache_state = BTRFS_DC_CLEAR; + + old_val = cache->used; + num_bytes = min(total, cache->length - byte_in_group); + if (alloc) { + old_val += num_bytes; + cache->used = old_val; + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->bytes_used += num_bytes; + cache->space_info->disk_used += num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + } else { + old_val -= num_bytes; + cache->used = old_val; + cache->pinned += num_bytes; + btrfs_space_info_update_bytes_pinned(info, + cache->space_info, num_bytes); + cache->space_info->bytes_used -= num_bytes; + cache->space_info->disk_used -= num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + percpu_counter_add_batch( + &cache->space_info->total_bytes_pinned, + num_bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH); + set_extent_dirty(info->pinned_extents, + bytenr, bytenr + num_bytes - 1, + GFP_NOFS | __GFP_NOFAIL); + } + + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &trans->transaction->dirty_bgs); + trans->delayed_ref_updates++; + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + + /* + * No longer have used bytes in this block group, queue it for + * deletion. We do this after adding the block group to the + * dirty list to avoid races between cleaner kthread and space + * cache writeout. + */ + if (!alloc && old_val == 0) + btrfs_mark_bg_unused(cache); + + btrfs_put_block_group(cache); + total -= num_bytes; + bytenr += num_bytes; + } + + /* Modified block groups are accounted for in the delayed_refs_rsv. */ + btrfs_update_delayed_refs_rsv(trans); + return ret; +} + +/** + * btrfs_add_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @ram_bytes: The number of bytes of file content, and will be same to + * @num_bytes except for the compress path. + * @num_bytes: The number of bytes in question + * @delalloc: The blocks are allocated for the delalloc write + * + * This is called by the allocator when it reserves space. If this is a + * reservation and the block group has become read only we cannot make the + * reservation and return -EAGAIN, otherwise this function always succeeds. + */ +int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, + u64 ram_bytes, u64 num_bytes, int delalloc) +{ + struct btrfs_space_info *space_info = cache->space_info; + int ret = 0; + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (cache->ro) { + ret = -EAGAIN; + } else { + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, "space_info", + space_info->flags, num_bytes, 1); + btrfs_space_info_update_bytes_may_use(cache->fs_info, + space_info, -ram_bytes); + if (delalloc) + cache->delalloc_bytes += num_bytes; + } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + return ret; +} + +/** + * btrfs_free_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @delalloc: The blocks are allocated for the delalloc write + * + * This is called by somebody who is freeing space that was never actually used + * on disk. For example if you reserve some space for a new leaf in transaction + * A and before transaction A commits you free that leaf, you call this with + * reserve set to 0 in order to clear the reservation. + */ +void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, + u64 num_bytes, int delalloc) +{ + struct btrfs_space_info *space_info = cache->space_info; + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->max_extent_size = 0; + + if (delalloc) + cache->delalloc_bytes -= num_bytes; + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); +} + +static void force_metadata_allocation(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + found->force_alloc = CHUNK_ALLOC_FORCE; + } + rcu_read_unlock(); +} + +static int should_alloc_chunk(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *sinfo, int force) +{ + u64 bytes_used = btrfs_space_info_used(sinfo, false); + u64 thresh; + + if (force == CHUNK_ALLOC_FORCE) + return 1; + + /* + * in limited mode, we want to have some free space up to + * about 1% of the FS size. + */ + if (force == CHUNK_ALLOC_LIMITED) { + thresh = btrfs_super_total_bytes(fs_info->super_copy); + thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); + + if (sinfo->total_bytes - bytes_used < thresh) + return 1; + } + + if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) + return 0; + return 1; +} + +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) +{ + u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type); + + return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); +} + +/* + * If force is CHUNK_ALLOC_FORCE: + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. + * If force is NOT CHUNK_ALLOC_FORCE: + * - return 0 if it doesn't need to allocate a new chunk, + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. + */ +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, + enum btrfs_chunk_alloc_enum force) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_space_info *space_info; + bool wait_for_alloc = false; + bool should_alloc = false; + int ret = 0; + + /* Don't re-enter if we're already allocating a chunk */ + if (trans->allocating_chunk) + return -ENOSPC; + + space_info = btrfs_find_space_info(fs_info, flags); + ASSERT(space_info); + + do { + spin_lock(&space_info->lock); + if (force < space_info->force_alloc) + force = space_info->force_alloc; + should_alloc = should_alloc_chunk(fs_info, space_info, force); + if (space_info->full) { + /* No more free physical space */ + if (should_alloc) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(&space_info->lock); + return ret; + } else if (!should_alloc) { + spin_unlock(&space_info->lock); + return 0; + } else if (space_info->chunk_alloc) { + /* + * Someone is already allocating, so we need to block + * until this someone is finished and then loop to + * recheck if we should continue with our allocation + * attempt. + */ + wait_for_alloc = true; + spin_unlock(&space_info->lock); + mutex_lock(&fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); + } else { + /* Proceed with allocation */ + space_info->chunk_alloc = 1; + wait_for_alloc = false; + spin_unlock(&space_info->lock); + } + + cond_resched(); + } while (wait_for_alloc); + + mutex_lock(&fs_info->chunk_mutex); + trans->allocating_chunk = true; + + /* + * If we have mixed data/metadata chunks we want to make sure we keep + * allocating mixed chunks instead of individual chunks. + */ + if (btrfs_mixed_space_info(space_info)) + flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); + + /* + * if we're doing a data chunk, go ahead and make sure that + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ + if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) + force_metadata_allocation(fs_info); + } + + /* + * Check if we have enough space in SYSTEM chunk because we may need + * to update devices. + */ + check_system_chunk(trans, flags); + + ret = btrfs_alloc_chunk(trans, flags); + trans->allocating_chunk = false; + + spin_lock(&space_info->lock); + if (ret < 0) { + if (ret == -ENOSPC) + space_info->full = 1; + else + goto out; + } else { + ret = 1; + space_info->max_extent_size = 0; + } + + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; +out: + space_info->chunk_alloc = 0; + spin_unlock(&space_info->lock); + mutex_unlock(&fs_info->chunk_mutex); + /* + * When we allocate a new chunk we reserve space in the chunk block + * reserve to make sure we can COW nodes/leafs in the chunk tree or + * add new nodes/leafs to it if we end up needing to do it when + * inserting the chunk item and updating device items as part of the + * second phase of chunk allocation, performed by + * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a + * large number of new block groups to create in our transaction + * handle's new_bgs list to avoid exhausting the chunk block reserve + * in extreme cases - like having a single transaction create many new + * block groups when starting to write out the free space caches of all + * the block groups that were made dirty during the lifetime of the + * transaction. + */ + if (trans->chunk_bytes_reserved >= (u64)SZ_2M) + btrfs_create_pending_block_groups(trans); + + return ret; +} + +static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) +{ + u64 num_dev; + + num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; + if (!num_dev) + num_dev = fs_info->fs_devices->rw_devices; + + return num_dev; +} + +/* + * Reserve space in the system space for allocating or removing a chunk + */ +void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_space_info *info; + u64 left; + u64 thresh; + int ret = 0; + u64 num_devs; + + /* + * Needed because we can end up allocating a system chunk and for an + * atomic and race free space reservation in the chunk block reserve. + */ + lockdep_assert_held(&fs_info->chunk_mutex); + + info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + spin_lock(&info->lock); + left = info->total_bytes - btrfs_space_info_used(info, true); + spin_unlock(&info->lock); + + num_devs = get_profile_num_devs(fs_info, type); + + /* num_devs device items to update and 1 chunk item to add or remove */ + thresh = btrfs_calc_metadata_size(fs_info, num_devs) + + btrfs_calc_insert_metadata_size(fs_info, 1); + + if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", + left, thresh, type); + btrfs_dump_space_info(fs_info, info, 0, 0); + } + + if (left < thresh) { + u64 flags = btrfs_system_alloc_profile(fs_info); + + /* + * Ignore failure to create system chunk. We might end up not + * needing it, as we might not need to COW all nodes/leafs from + * the paths we visit in the chunk tree (they were already COWed + * or created in the current transaction for example). + */ + ret = btrfs_alloc_chunk(trans, flags); + } + + if (!ret) { + ret = btrfs_block_rsv_add(fs_info->chunk_root, + &fs_info->chunk_block_rsv, + thresh, BTRFS_RESERVE_NO_FLUSH); + if (!ret) + trans->chunk_bytes_reserved += thresh; + } +} + +void btrfs_put_block_group_cache(struct btrfs_fs_info *info) +{ + struct btrfs_block_group *block_group; + u64 last = 0; + + while (1) { + struct inode *inode; + + block_group = btrfs_lookup_first_block_group(info, last); + while (block_group) { + btrfs_wait_block_group_cache_done(block_group); + spin_lock(&block_group->lock); + if (block_group->iref) + break; + spin_unlock(&block_group->lock); + block_group = btrfs_next_block_group(block_group); + } + if (!block_group) { + if (last == 0) + break; + last = 0; + continue; + } + + inode = block_group->inode; + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + ASSERT(block_group->io_ctl.inode == NULL); + iput(inode); + last = block_group->start + block_group->length; + btrfs_put_block_group(block_group); + } +} + +/* + * Must be called only after stopping all workers, since we could have block + * group caching kthreads running, and therefore they could race with us if we + * freed the block groups before stopping them. + */ +int btrfs_free_block_groups(struct btrfs_fs_info *info) +{ + struct btrfs_block_group *block_group; + struct btrfs_space_info *space_info; + struct btrfs_caching_control *caching_ctl; + struct rb_node *n; + + down_write(&info->commit_root_sem); + while (!list_empty(&info->caching_block_groups)) { + caching_ctl = list_entry(info->caching_block_groups.next, + struct btrfs_caching_control, list); + list_del(&caching_ctl->list); + btrfs_put_caching_control(caching_ctl); + } + up_write(&info->commit_root_sem); + + spin_lock(&info->unused_bgs_lock); + while (!list_empty(&info->unused_bgs)) { + block_group = list_first_entry(&info->unused_bgs, + struct btrfs_block_group, + bg_list); + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&info->unused_bgs_lock); + + spin_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + block_group = rb_entry(n, struct btrfs_block_group, + cache_node); + rb_erase(&block_group->cache_node, + &info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); + spin_unlock(&info->block_group_cache_lock); + + down_write(&block_group->space_info->groups_sem); + list_del(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + /* + * We haven't cached this block group, which means we could + * possibly have excluded extents on this block group. + */ + if (block_group->cached == BTRFS_CACHE_NO || + block_group->cached == BTRFS_CACHE_ERROR) + btrfs_free_excluded_extents(block_group); + + btrfs_remove_free_space_cache(block_group); + ASSERT(block_group->cached != BTRFS_CACHE_STARTED); + ASSERT(list_empty(&block_group->dirty_list)); + ASSERT(list_empty(&block_group->io_list)); + ASSERT(list_empty(&block_group->bg_list)); + ASSERT(atomic_read(&block_group->count) == 1); + btrfs_put_block_group(block_group); + + spin_lock(&info->block_group_cache_lock); + } + spin_unlock(&info->block_group_cache_lock); + + /* + * Now that all the block groups are freed, go through and free all the + * space_info structs. This is only called during the final stages of + * unmount, and so we know nobody is using them. We call + * synchronize_rcu() once before we start, just to be on the safe side. + */ + synchronize_rcu(); + + btrfs_release_global_block_rsv(info); + + while (!list_empty(&info->space_info)) { + space_info = list_entry(info->space_info.next, + struct btrfs_space_info, + list); + + /* + * Do not hide this behind enospc_debug, this is actually + * important and indicates a real bug if this happens. + */ + if (WARN_ON(space_info->bytes_pinned > 0 || + space_info->bytes_reserved > 0 || + space_info->bytes_may_use > 0)) + btrfs_dump_space_info(info, space_info, 0, 0); + list_del(&space_info->list); + btrfs_sysfs_remove_space_info(space_info); + } + return 0; +} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h new file mode 100644 index 000000000000..9b409676c4b2 --- /dev/null +++ b/fs/btrfs/block-group.h @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_BLOCK_GROUP_H +#define BTRFS_BLOCK_GROUP_H + +#include "free-space-cache.h" + +enum btrfs_disk_cache_state { + BTRFS_DC_WRITTEN, + BTRFS_DC_ERROR, + BTRFS_DC_CLEAR, + BTRFS_DC_SETUP, +}; + +/* + * Control flags for do_chunk_alloc's force field CHUNK_ALLOC_NO_FORCE means to + * only allocate a chunk if we really need one. + * + * CHUNK_ALLOC_LIMITED means to only try and allocate one if we have very few + * chunks already allocated. This is used as part of the clustering code to + * help make sure we have a good pool of storage to cluster in, without filling + * the FS with empty chunks + * + * CHUNK_ALLOC_FORCE means it must try to allocate one + */ +enum btrfs_chunk_alloc_enum { + CHUNK_ALLOC_NO_FORCE, + CHUNK_ALLOC_LIMITED, + CHUNK_ALLOC_FORCE, +}; + +struct btrfs_caching_control { + struct list_head list; + struct mutex mutex; + wait_queue_head_t wait; + struct btrfs_work work; + struct btrfs_block_group *block_group; + u64 progress; + refcount_t count; +}; + +/* Once caching_thread() finds this much free space, it will wake up waiters. */ +#define CACHING_CTL_WAKE_UP SZ_2M + +struct btrfs_block_group { + struct btrfs_fs_info *fs_info; + struct inode *inode; + spinlock_t lock; + u64 start; + u64 length; + u64 pinned; + u64 reserved; + u64 used; + u64 delalloc_bytes; + u64 bytes_super; + u64 flags; + u64 cache_generation; + + /* + * If the free space extent count exceeds this number, convert the block + * group to bitmaps. + */ + u32 bitmap_high_thresh; + + /* + * If the free space extent count drops below this number, convert the + * block group back to extents. + */ + u32 bitmap_low_thresh; + + /* + * It is just used for the delayed data space allocation because + * only the data space allocation and the relative metadata update + * can be done cross the transaction. + */ + struct rw_semaphore data_rwsem; + + /* For raid56, this is a full stripe, without parity */ + unsigned long full_stripe_len; + + unsigned int ro; + unsigned int iref:1; + unsigned int has_caching_ctl:1; + unsigned int removed:1; + + int disk_cache_state; + + /* Cache tracking stuff */ + int cached; + struct btrfs_caching_control *caching_ctl; + u64 last_byte_to_unpin; + + struct btrfs_space_info *space_info; + + /* Free space cache stuff */ + struct btrfs_free_space_ctl *free_space_ctl; + + /* Block group cache stuff */ + struct rb_node cache_node; + + /* For block groups in the same raid type */ + struct list_head list; + + /* Usage count */ + atomic_t count; + + /* + * List of struct btrfs_free_clusters for this block group. + * Today it will only have one thing on it, but that may change + */ + struct list_head cluster_list; + + /* For delayed block group creation or deletion of empty block groups */ + struct list_head bg_list; + + /* For read-only block groups */ + struct list_head ro_list; + + atomic_t trimming; + + /* For dirty block groups */ + struct list_head dirty_list; + struct list_head io_list; + + struct btrfs_io_ctl io_ctl; + + /* + * Incremented when doing extent allocations and holding a read lock + * on the space_info's groups_sem semaphore. + * Decremented when an ordered extent that represents an IO against this + * block group's range is created (after it's added to its inode's + * root's list of ordered extents) or immediately after the allocation + * if it's a metadata extent or fallocate extent (for these cases we + * don't create ordered extents). + */ + atomic_t reservations; + + /* + * Incremented while holding the spinlock *lock* by a task checking if + * it can perform a nocow write (incremented if the value for the *ro* + * field is 0). Decremented by such tasks once they create an ordered + * extent or before that if some error happens before reaching that step. + * This is to prevent races between block group relocation and nocow + * writes through direct IO. + */ + atomic_t nocow_writers; + + /* Lock for free space tree operations. */ + struct mutex free_space_lock; + + /* + * Does the block group need to be added to the free space tree? + * Protected by free_space_lock. + */ + int needs_free_space; + + /* Record locked full stripes for RAID5/6 block group */ + struct btrfs_full_stripe_locks_tree full_stripe_locks_root; +}; + +#ifdef CONFIG_BTRFS_DEBUG +static inline int btrfs_should_fragment_free_space( + struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + + return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && + block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || + (btrfs_test_opt(fs_info, FRAGMENT_DATA) && + block_group->flags & BTRFS_BLOCK_GROUP_DATA); +} +#endif + +struct btrfs_block_group *btrfs_lookup_first_block_group( + struct btrfs_fs_info *info, u64 bytenr); +struct btrfs_block_group *btrfs_lookup_block_group( + struct btrfs_fs_info *info, u64 bytenr); +struct btrfs_block_group *btrfs_next_block_group( + struct btrfs_block_group *cache); +void btrfs_get_block_group(struct btrfs_block_group *cache); +void btrfs_put_block_group(struct btrfs_block_group *cache); +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start); +void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg); +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_wait_nocow_writers(struct btrfs_block_group *bg); +void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, + u64 num_bytes); +int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache); +int btrfs_cache_block_group(struct btrfs_block_group *cache, + int load_cache_only); +void btrfs_put_caching_control(struct btrfs_caching_control *ctl); +struct btrfs_caching_control *btrfs_get_caching_control( + struct btrfs_block_group *cache); +u64 add_new_free_space(struct btrfs_block_group *block_group, + u64 start, u64 end); +struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( + struct btrfs_fs_info *fs_info, + const u64 chunk_offset); +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + u64 group_start, struct extent_map *em); +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); +void btrfs_mark_bg_unused(struct btrfs_block_group *bg); +int btrfs_read_block_groups(struct btrfs_fs_info *info); +int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, + u64 type, u64 chunk_offset, u64 size); +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); +int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, + bool do_chunk_alloc); +void btrfs_dec_block_group_ro(struct btrfs_block_group *cache); +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans); +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans); +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); +int btrfs_update_block_group(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, int alloc); +int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, + u64 ram_bytes, u64 num_bytes, int delalloc); +void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, + u64 num_bytes, int delalloc); +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, + enum btrfs_chunk_alloc_enum force); +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); +void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type); +u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); +void btrfs_put_block_group_cache(struct btrfs_fs_info *info); +int btrfs_free_block_groups(struct btrfs_fs_info *info); + +static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA); +} + +static inline u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA); +} + +static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); +} + +static inline int btrfs_block_group_done(struct btrfs_block_group *cache) +{ + smp_mb(); + return cache->cached == BTRFS_CACHE_FINISHED || + cache->cached == BTRFS_CACHE_ERROR; +} + +#endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c new file mode 100644 index 000000000000..d07bd41a7c1e --- /dev/null +++ b/fs/btrfs/block-rsv.c @@ -0,0 +1,437 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "misc.h" +#include "ctree.h" +#include "block-rsv.h" +#include "space-info.h" +#include "transaction.h" + +static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + struct btrfs_block_rsv *dest, u64 num_bytes, + u64 *qgroup_to_release_ret) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + u64 qgroup_to_release = 0; + u64 ret; + + spin_lock(&block_rsv->lock); + if (num_bytes == (u64)-1) { + num_bytes = block_rsv->size; + qgroup_to_release = block_rsv->qgroup_rsv_size; + } + block_rsv->size -= num_bytes; + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } else { + num_bytes = 0; + } + if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { + qgroup_to_release = block_rsv->qgroup_rsv_reserved - + block_rsv->qgroup_rsv_size; + block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; + } else { + qgroup_to_release = 0; + } + spin_unlock(&block_rsv->lock); + + ret = num_bytes; + if (num_bytes > 0) { + if (dest) { + spin_lock(&dest->lock); + if (!dest->full) { + u64 bytes_to_add; + + bytes_to_add = dest->size - dest->reserved; + bytes_to_add = min(num_bytes, bytes_to_add); + dest->reserved += bytes_to_add; + if (dest->reserved >= dest->size) + dest->full = 1; + num_bytes -= bytes_to_add; + } + spin_unlock(&dest->lock); + } + if (num_bytes) + btrfs_space_info_free_bytes_may_use(fs_info, + space_info, + num_bytes); + } + if (qgroup_to_release_ret) + *qgroup_to_release_ret = qgroup_to_release; + return ret; +} + +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, + struct btrfs_block_rsv *dst, u64 num_bytes, + bool update_size) +{ + int ret; + + ret = btrfs_block_rsv_use_bytes(src, num_bytes); + if (ret) + return ret; + + btrfs_block_rsv_add_bytes(dst, num_bytes, update_size); + return 0; +} + +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) +{ + memset(rsv, 0, sizeof(*rsv)); + spin_lock_init(&rsv->lock); + rsv->type = type; +} + +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv, + unsigned short type) +{ + btrfs_init_block_rsv(rsv, type); + rsv->space_info = btrfs_find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); +} + +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, + unsigned short type) +{ + struct btrfs_block_rsv *block_rsv; + + block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); + if (!block_rsv) + return NULL; + + btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); + return block_rsv; +} + +void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv) +{ + if (!rsv) + return; + btrfs_block_rsv_release(fs_info, rsv, (u64)-1); + kfree(rsv); +} + +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush) +{ + int ret; + + if (num_bytes == 0) + return 0; + + ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (!ret) + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); + + return ret; +} + +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) +{ + u64 num_bytes = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + num_bytes = div_factor(block_rsv->size, min_factor); + if (block_rsv->reserved >= num_bytes) + ret = 0; + spin_unlock(&block_rsv->lock); + + return ret; +} + +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush) +{ + u64 num_bytes = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + num_bytes = min_reserved; + if (block_rsv->reserved >= num_bytes) + ret = 0; + else + num_bytes -= block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (!ret) + return 0; + + ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (!ret) { + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); + return 0; + } + + return ret; +} + +u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, u64 *qgroup_to_release) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *target = NULL; + + /* + * If we are the delayed_rsv then push to the global rsv, otherwise dump + * into the delayed rsv if it is not full. + */ + if (block_rsv == delayed_rsv) + target = global_rsv; + else if (block_rsv != global_rsv && !delayed_rsv->full) + target = delayed_rsv; + + if (target && block_rsv->space_info != target->space_info) + target = NULL; + + return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, + qgroup_to_release); +} + +int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes) +{ + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved >= num_bytes) { + block_rsv->reserved -= num_bytes; + if (block_rsv->reserved < block_rsv->size) + block_rsv->full = 0; + ret = 0; + } + spin_unlock(&block_rsv->lock); + return ret; +} + +void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes, bool update_size) +{ + spin_lock(&block_rsv->lock); + block_rsv->reserved += num_bytes; + if (update_size) + block_rsv->size += num_bytes; + else if (block_rsv->reserved >= block_rsv->size) + block_rsv->full = 1; + spin_unlock(&block_rsv->lock); +} + +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 min_bytes; + + if (global_rsv->space_info != dest->space_info) + return -ENOSPC; + + spin_lock(&global_rsv->lock); + min_bytes = div_factor(global_rsv->size, min_factor); + if (global_rsv->reserved < min_bytes + num_bytes) { + spin_unlock(&global_rsv->lock); + return -ENOSPC; + } + global_rsv->reserved -= num_bytes; + if (global_rsv->reserved < global_rsv->size) + global_rsv->full = 0; + spin_unlock(&global_rsv->lock); + + btrfs_block_rsv_add_bytes(dest, num_bytes, true); + return 0; +} + +void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + struct btrfs_space_info *sinfo = block_rsv->space_info; + u64 num_bytes; + unsigned min_items; + + /* + * The global block rsv is based on the size of the extent tree, the + * checksum tree and the root tree. If the fs is empty we want to set + * it to a minimal amount for safety. + */ + num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + + btrfs_root_used(&fs_info->csum_root->root_item) + + btrfs_root_used(&fs_info->tree_root->root_item); + + /* + * We at a minimum are going to modify the csum root, the tree root, and + * the extent root. + */ + min_items = 3; + + /* + * But we also want to reserve enough space so we can do the fallback + * global reserve for an unlink, which is an additional 5 items (see the + * comment in __unlink_start_trans for what we're modifying.) + * + * But we also need space for the delayed ref updates from the unlink, + * so its 10, 5 for the actual operation, and 5 for the delayed ref + * updates. + */ + min_items += 10; + + num_bytes = max_t(u64, num_bytes, + btrfs_calc_insert_metadata_size(fs_info, min_items)); + + spin_lock(&sinfo->lock); + spin_lock(&block_rsv->lock); + + block_rsv->size = min_t(u64, num_bytes, SZ_512M); + + if (block_rsv->reserved < block_rsv->size) { + num_bytes = block_rsv->size - block_rsv->reserved; + block_rsv->reserved += num_bytes; + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, + num_bytes); + } else if (block_rsv->reserved > block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, + -num_bytes); + block_rsv->reserved = block_rsv->size; + btrfs_try_granting_tickets(fs_info, sinfo); + } + + if (block_rsv->reserved == block_rsv->size) + block_rsv->full = 1; + else + block_rsv->full = 0; + + spin_unlock(&block_rsv->lock); + spin_unlock(&sinfo->lock); +} + +void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + fs_info->chunk_block_rsv.space_info = space_info; + + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + fs_info->global_block_rsv.space_info = space_info; + fs_info->trans_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.space_info = space_info; + fs_info->delayed_block_rsv.space_info = space_info; + fs_info->delayed_refs_rsv.space_info = space_info; + + fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; + fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; + fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; + fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; + if (fs_info->quota_root) + fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; + fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; + + btrfs_update_global_block_rsv(fs_info); +} + +void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1); + WARN_ON(fs_info->trans_block_rsv.size > 0); + WARN_ON(fs_info->trans_block_rsv.reserved > 0); + WARN_ON(fs_info->chunk_block_rsv.size > 0); + WARN_ON(fs_info->chunk_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_block_rsv.size > 0); + WARN_ON(fs_info->delayed_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.size > 0); +} + +static struct btrfs_block_rsv *get_block_rsv( + const struct btrfs_trans_handle *trans, + const struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *block_rsv = NULL; + + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + (root == fs_info->csum_root && trans->adding_csums) || + (root == fs_info->uuid_root)) + block_rsv = trans->block_rsv; + + if (!block_rsv) + block_rsv = root->block_rsv; + + if (!block_rsv) + block_rsv = &fs_info->empty_block_rsv; + + return block_rsv; +} + +struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u32 blocksize) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + int ret; + bool global_updated = false; + + block_rsv = get_block_rsv(trans, root); + + if (unlikely(block_rsv->size == 0)) + goto try_reserve; +again: + ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize); + if (!ret) + return block_rsv; + + if (block_rsv->failfast) + return ERR_PTR(ret); + + if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { + global_updated = true; + btrfs_update_global_block_rsv(fs_info); + goto again; + } + + /* + * The global reserve still exists to save us from ourselves, so don't + * warn_on if we are short on our delayed refs reserve. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && + btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL * 10, + /*DEFAULT_RATELIMIT_BURST*/ 1); + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG + "BTRFS: block rsv returned %d\n", ret); + } +try_reserve: + ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); + if (!ret) + return block_rsv; + /* + * If we couldn't reserve metadata bytes try and use some from + * the global reserve if its space type is the same as the global + * reservation. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && + block_rsv->space_info == global_rsv->space_info) { + ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + } + return ERR_PTR(ret); +} diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h new file mode 100644 index 000000000000..d1428bb73fc5 --- /dev/null +++ b/fs/btrfs/block-rsv.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_BLOCK_RSV_H +#define BTRFS_BLOCK_RSV_H + +struct btrfs_trans_handle; +enum btrfs_reserve_flush_enum; + +/* + * Types of block reserves + */ +enum { + BTRFS_BLOCK_RSV_GLOBAL, + BTRFS_BLOCK_RSV_DELALLOC, + BTRFS_BLOCK_RSV_TRANS, + BTRFS_BLOCK_RSV_CHUNK, + BTRFS_BLOCK_RSV_DELOPS, + BTRFS_BLOCK_RSV_DELREFS, + BTRFS_BLOCK_RSV_EMPTY, + BTRFS_BLOCK_RSV_TEMP, +}; + +struct btrfs_block_rsv { + u64 size; + u64 reserved; + struct btrfs_space_info *space_info; + spinlock_t lock; + unsigned short full; + unsigned short type; + unsigned short failfast; + + /* + * Qgroup equivalent for @size @reserved + * + * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care + * about things like csum size nor how many tree blocks it will need to + * reserve. + * + * Qgroup cares more about net change of the extent usage. + * + * So for one newly inserted file extent, in worst case it will cause + * leaf split and level increase, nodesize for each file extent is + * already too much. + * + * In short, qgroup_size/reserved is the upper limit of possible needed + * qgroup metadata reservation. + */ + u64 qgroup_rsv_size; + u64 qgroup_rsv_reserved; +}; + +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, + unsigned short type); +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv, + unsigned short type); +void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv); +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush); +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush); +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, u64 num_bytes, + bool update_size); +int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor); +void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes, bool update_size); +u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, u64 *qgroup_to_release); +void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info); +void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info); +void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info); +struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u32 blocksize); + +static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); +} + +static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u32 blocksize) +{ + btrfs_block_rsv_add_bytes(block_rsv, blocksize, false); + btrfs_block_rsv_release(fs_info, block_rsv, 0); +} + +#endif /* BTRFS_BLOCK_RSV_H */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 6f5d07415dab..4e12a477d32e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -63,9 +63,6 @@ struct btrfs_inode { /* held while logging the inode in tree-log.c */ struct mutex log_mutex; - /* held while doing delalloc reservations */ - struct mutex delalloc_mutex; - /* used to order data wrt metadata */ struct btrfs_ordered_inode_tree ordered_tree; @@ -148,12 +145,6 @@ struct btrfs_inode { u64 last_unlink_trans; /* - * Track the transaction id of the last transaction used to create a - * hard link for the inode. This is used by the log tree (fsync). - */ - u64 last_link_trans; - - /* * Number of bytes outstanding that are going to need csums. This is * used in ENOSPC accounting. */ @@ -203,8 +194,6 @@ struct btrfs_inode { struct inode vfs_inode; }; -extern unsigned char btrfs_filetype_table[]; - static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); @@ -345,22 +334,34 @@ static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode) clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); } +/* Array of bytes with variable length, hexadecimal format 0x1234 */ +#define CSUM_FMT "0x%*phN" +#define CSUM_FMT_VALUE(size, bytes) size, bytes + static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, - u64 logical_start, u32 csum, u32 csum_expected, int mirror_num) + u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) { struct btrfs_root *root = inode->root; + struct btrfs_super_block *sb = root->fs_info->super_copy; + const u16 csum_size = btrfs_super_csum_size(sb); /* Output minus objectid, which is more meaningful */ if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) btrfs_warn_rl(root->fs_info, - "csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d", +"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", root->root_key.objectid, btrfs_ino(inode), - logical_start, csum, csum_expected, mirror_num); + logical_start, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); else btrfs_warn_rl(root->fs_info, - "csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d", +"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", root->root_key.objectid, btrfs_ino(inode), - logical_start, csum, csum_expected, mirror_num); + logical_start, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); } #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index b0c8094528d1..0b52ab4cb964 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -83,7 +83,7 @@ #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/string.h> -#include <linux/crc32c.h> +#include <crypto/hash.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -940,7 +940,7 @@ static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) kfree(sf); } -static int btrfsic_process_metablock( +static noinline_for_stack int btrfsic_process_metablock( struct btrfsic_state *state, struct btrfsic_block *const first_block, struct btrfsic_block_data_ctx *const first_block_ctx, @@ -1706,13 +1706,14 @@ static void btrfsic_dump_database(struct btrfsic_state *state) * Test whether the disk block contains a tree block (leaf or node) * (note that this test fails for the super block) */ -static int btrfsic_test_for_metadata(struct btrfsic_state *state, - char **datav, unsigned int num_pages) +static noinline_for_stack int btrfsic_test_for_metadata( + struct btrfsic_state *state, + char **datav, unsigned int num_pages) { struct btrfs_fs_info *fs_info = state->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct btrfs_header *h; u8 csum[BTRFS_CSUM_SIZE]; - u32 crc = ~(u32)0; unsigned int i; if (num_pages * PAGE_SIZE < state->metablock_size) @@ -1723,14 +1724,17 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE)) return 1; + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + for (i = 0; i < num_pages; i++) { u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); size_t sublen = i ? PAGE_SIZE : (PAGE_SIZE - BTRFS_CSUM_SIZE); - crc = crc32c(crc, data, sublen); + crypto_shash_update(shash, data, sublen); } - btrfs_csum_final(crc, csum); + crypto_shash_final(shash, csum); if (memcmp(csum, h->csum, state->csum_size)) return 1; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 4f2a8ae0aa42..43e1660f450f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -17,6 +17,8 @@ #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/log2.h> +#include <crypto/hash.h> +#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -27,6 +29,41 @@ #include "extent_io.h" #include "extent_map.h" +int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +struct list_head *zlib_alloc_workspace(unsigned int level); +void zlib_free_workspace(struct list_head *ws); +struct list_head *zlib_get_workspace(unsigned int level); + +int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +struct list_head *lzo_alloc_workspace(unsigned int level); +void lzo_free_workspace(struct list_head *ws); + +int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zstd_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +void zstd_init_workspace_manager(void); +void zstd_cleanup_workspace_manager(void); +struct list_head *zstd_alloc_workspace(unsigned int level); +void zstd_free_workspace(struct list_head *ws); +struct list_head *zstd_get_workspace(unsigned int level); +void zstd_put_workspace(struct list_head *ws); + static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; const char* btrfs_compress_type2str(enum btrfs_compression_type type) @@ -37,11 +74,93 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type) case BTRFS_COMPRESS_ZSTD: case BTRFS_COMPRESS_NONE: return btrfs_compress_types[type]; + default: + break; } return NULL; } +bool btrfs_compress_is_valid_type(const char *str, size_t len) +{ + int i; + + for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) { + size_t comp_len = strlen(btrfs_compress_types[i]); + + if (len < comp_len) + continue; + + if (!strncmp(btrfs_compress_types[i], str, comp_len)) + return true; + } + return false; +} + +static int compression_compress_pages(int type, struct list_head *ws, + struct address_space *mapping, u64 start, struct page **pages, + unsigned long *out_pages, unsigned long *total_in, + unsigned long *total_out) +{ + switch (type) { + case BTRFS_COMPRESS_ZLIB: + return zlib_compress_pages(ws, mapping, start, pages, + out_pages, total_in, total_out); + case BTRFS_COMPRESS_LZO: + return lzo_compress_pages(ws, mapping, start, pages, + out_pages, total_in, total_out); + case BTRFS_COMPRESS_ZSTD: + return zstd_compress_pages(ws, mapping, start, pages, + out_pages, total_in, total_out); + case BTRFS_COMPRESS_NONE: + default: + /* + * This can't happen, the type is validated several times + * before we get here. As a sane fallback, return what the + * callers will understand as 'no compression happened'. + */ + return -E2BIG; + } +} + +static int compression_decompress_bio(int type, struct list_head *ws, + struct compressed_bio *cb) +{ + switch (type) { + case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb); + case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb); + case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb); + case BTRFS_COMPRESS_NONE: + default: + /* + * This can't happen, the type is validated several times + * before we get here. + */ + BUG(); + } +} + +static int compression_decompress(int type, struct list_head *ws, + unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen) +{ + switch (type) { + case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page, + start_byte, srclen, destlen); + case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, + start_byte, srclen, destlen); + case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, + start_byte, srclen, destlen); + case BTRFS_COMPRESS_NONE: + default: + /* + * This can't happen, the type is validated several times + * before we get here. + */ + BUG(); + } +} + static int btrfs_decompress_bio(struct compressed_bio *cb); static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, @@ -57,32 +176,37 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct compressed_bio *cb, u64 disk_start) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int ret; struct page *page; unsigned long i; char *kaddr; - u32 csum; - u32 *cb_sum = &cb->sums; + u8 csum[BTRFS_CSUM_SIZE]; + u8 *cb_sum = cb->sums; if (inode->flags & BTRFS_INODE_NODATASUM) return 0; + shash->tfm = fs_info->csum_shash; + for (i = 0; i < cb->nr_pages; i++) { page = cb->compressed_pages[i]; - csum = ~(u32)0; + crypto_shash_init(shash); kaddr = kmap_atomic(page); - csum = btrfs_csum_data(kaddr, csum, PAGE_SIZE); - btrfs_csum_final(csum, (u8 *)&csum); + crypto_shash_update(shash, kaddr, PAGE_SIZE); kunmap_atomic(kaddr); + crypto_shash_final(shash, (u8 *)&csum); - if (csum != *cb_sum) { - btrfs_print_data_csum_error(inode, disk_start, csum, - *cb_sum, cb->mirror_num); + if (memcmp(&csum, cb_sum, csum_size)) { + btrfs_print_data_csum_error(inode, disk_start, + csum, cb_sum, cb->mirror_num); ret = -EIO; goto fail; } - cb_sum++; + cb_sum += csum_size; } ret = 0; @@ -160,7 +284,6 @@ csum_failed: if (cb->errors) { bio_io_error(cb->orig_bio); } else { - int i; struct bio_vec *bvec; struct bvec_iter_all iter_all; @@ -169,7 +292,7 @@ csum_failed: * checked so the end_io handlers know about it */ ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, cb->orig_bio, i, iter_all) + bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) SetPageChecked(bvec->bv_page); bio_endio(cb->orig_bio); @@ -251,7 +374,7 @@ static void end_compressed_bio_write(struct bio *bio) cb->compressed_pages[0]->mapping = cb->inode->i_mapping; btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0], cb->start, cb->start + cb->len - 1, - bio->bi_status ? BLK_STS_OK : BLK_STS_NOTSUPP); + bio->bi_status == BLK_STS_OK); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -289,7 +412,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long compressed_len, struct page **compressed_pages, unsigned long nr_pages, - unsigned int write_flags) + unsigned int write_flags, + struct cgroup_subsys_state *blkcg_css) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio *bio = NULL; @@ -298,7 +422,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, int pg_index = 0; struct page *page; u64 first_byte = disk_start; - struct block_device *bdev; blk_status_t ret; int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -317,12 +440,15 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, cb->orig_bio = NULL; cb->nr_pages = nr_pages; - bdev = fs_info->fs_devices->latest_bdev; - - bio = btrfs_bio_alloc(bdev, first_byte); + bio = btrfs_bio_alloc(first_byte); bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; + + if (blkcg_css) { + bio->bi_opf |= REQ_CGROUP_PUNT; + kthread_associate_blkcg(blkcg_css); + } refcount_set(&cb->pending_bios, 1); /* create and submit bios for the compressed pages */ @@ -355,16 +481,18 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, BUG_ON(ret); /* -ENOMEM */ } - ret = btrfs_map_bio(fs_info, bio, 0, 1); + ret = btrfs_map_bio(fs_info, bio, 0); if (ret) { bio->bi_status = ret; bio_endio(bio); } - bio = btrfs_bio_alloc(bdev, first_byte); + bio = btrfs_bio_alloc(first_byte); bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; + if (blkcg_css) + bio->bi_opf |= REQ_CGROUP_PUNT; bio_add_page(bio, page, PAGE_SIZE, 0); } if (bytes_left < PAGE_SIZE) { @@ -385,12 +513,15 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, BUG_ON(ret); /* -ENOMEM */ } - ret = btrfs_map_bio(fs_info, bio, 0, 1); + ret = btrfs_map_bio(fs_info, bio, 0); if (ret) { bio->bi_status = ret; bio_endio(bio); } + if (blkcg_css) + kthread_associate_blkcg(NULL); + return 0; } @@ -529,7 +660,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, unsigned long nr_pages; unsigned long pg_index; struct page *page; - struct block_device *bdev; struct bio *comp_bio; u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9; u64 em_len; @@ -537,7 +667,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct extent_map *em; blk_status_t ret = BLK_STS_RESOURCE; int faili = 0; - u32 *sums; + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + u8 *sums; em_tree = &BTRFS_I(inode)->extent_tree; @@ -559,7 +690,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->errors = 0; cb->inode = inode; cb->mirror_num = mirror_num; - sums = &cb->sums; + sums = cb->sums; cb->start = em->orig_start; em_len = em->len; @@ -579,8 +710,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!cb->compressed_pages) goto fail1; - bdev = fs_info->fs_devices->latest_bdev; - for (pg_index = 0; pg_index < nr_pages; pg_index++) { cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | __GFP_HIGHMEM); @@ -598,7 +727,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; - comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); + comp_bio = btrfs_bio_alloc(cur_disk_byte); comp_bio->bi_opf = REQ_OP_READ; comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; @@ -618,6 +747,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->mapping = NULL; if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + unsigned int nr_sectors; + ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ @@ -635,16 +766,18 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, sums); BUG_ON(ret); /* -ENOMEM */ } - sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size, - fs_info->sectorsize); - ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); + nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, + fs_info->sectorsize); + sums += csum_size * nr_sectors; + + ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); if (ret) { comp_bio->bi_status = ret; bio_endio(comp_bio); } - comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); + comp_bio = btrfs_bio_alloc(cur_disk_byte); comp_bio->bi_opf = REQ_OP_READ; comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; @@ -662,7 +795,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, BUG_ON(ret); /* -ENOMEM */ } - ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); + ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); if (ret) { comp_bio->bi_status = ret; bio_endio(comp_bio); @@ -733,26 +866,6 @@ struct heuristic_ws { static struct workspace_manager heuristic_wsm; -static void heuristic_init_workspace_manager(void) -{ - btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress); -} - -static void heuristic_cleanup_workspace_manager(void) -{ - btrfs_cleanup_workspace_manager(&heuristic_wsm); -} - -static struct list_head *heuristic_get_workspace(unsigned int level) -{ - return btrfs_get_workspace(&heuristic_wsm, level); -} - -static void heuristic_put_workspace(struct list_head *ws) -{ - btrfs_put_workspace(&heuristic_wsm, ws); -} - static void free_heuristic_ws(struct list_head *ws) { struct heuristic_ws *workspace; @@ -793,12 +906,7 @@ fail: } const struct btrfs_compress_op btrfs_heuristic_compress = { - .init_workspace_manager = heuristic_init_workspace_manager, - .cleanup_workspace_manager = heuristic_cleanup_workspace_manager, - .get_workspace = heuristic_get_workspace, - .put_workspace = heuristic_put_workspace, - .alloc_workspace = alloc_heuristic_ws, - .free_workspace = free_heuristic_ws, + .workspace_manager = &heuristic_wsm, }; static const struct btrfs_compress_op * const btrfs_compress_op[] = { @@ -809,13 +917,44 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zstd_compress, }; -void btrfs_init_workspace_manager(struct workspace_manager *wsm, - const struct btrfs_compress_op *ops) +static struct list_head *alloc_workspace(int type, unsigned int level) { - struct list_head *workspace; + switch (type) { + case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level); + case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level); + case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level); + case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level); + default: + /* + * This can't happen, the type is validated several times + * before we get here. + */ + BUG(); + } +} + +static void free_workspace(int type, struct list_head *ws) +{ + switch (type) { + case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws); + case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws); + case BTRFS_COMPRESS_LZO: return lzo_free_workspace(ws); + case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws); + default: + /* + * This can't happen, the type is validated several times + * before we get here. + */ + BUG(); + } +} - wsm->ops = ops; +static void btrfs_init_workspace_manager(int type) +{ + struct workspace_manager *wsm; + struct list_head *workspace; + wsm = btrfs_compress_op[type]->workspace_manager; INIT_LIST_HEAD(&wsm->idle_ws); spin_lock_init(&wsm->ws_lock); atomic_set(&wsm->total_ws, 0); @@ -825,7 +964,7 @@ void btrfs_init_workspace_manager(struct workspace_manager *wsm, * Preallocate one workspace for each compression type so we can * guarantee forward progress in the worst case */ - workspace = wsm->ops->alloc_workspace(0); + workspace = alloc_workspace(type, 0); if (IS_ERR(workspace)) { pr_warn( "BTRFS: cannot preallocate compression workspace, will try later\n"); @@ -836,14 +975,16 @@ void btrfs_init_workspace_manager(struct workspace_manager *wsm, } } -void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman) +static void btrfs_cleanup_workspace_manager(int type) { + struct workspace_manager *wsman; struct list_head *ws; + wsman = btrfs_compress_op[type]->workspace_manager; while (!list_empty(&wsman->idle_ws)) { ws = wsman->idle_ws.next; list_del(ws); - wsman->ops->free_workspace(ws); + free_workspace(type, ws); atomic_dec(&wsman->total_ws); } } @@ -854,9 +995,9 @@ void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman) * Preallocation makes a forward progress guarantees and we do not return * errors. */ -struct list_head *btrfs_get_workspace(struct workspace_manager *wsm, - unsigned int level) +struct list_head *btrfs_get_workspace(int type, unsigned int level) { + struct workspace_manager *wsm; struct list_head *workspace; int cpus = num_online_cpus(); unsigned nofs_flag; @@ -866,6 +1007,7 @@ struct list_head *btrfs_get_workspace(struct workspace_manager *wsm, wait_queue_head_t *ws_wait; int *free_ws; + wsm = btrfs_compress_op[type]->workspace_manager; idle_ws = &wsm->idle_ws; ws_lock = &wsm->ws_lock; total_ws = &wsm->total_ws; @@ -901,7 +1043,7 @@ again: * context of btrfs_compress_bio/btrfs_compress_pages */ nofs_flag = memalloc_nofs_save(); - workspace = wsm->ops->alloc_workspace(level); + workspace = alloc_workspace(type, level); memalloc_nofs_restore(nofs_flag); if (IS_ERR(workspace)) { @@ -934,21 +1076,34 @@ again: static struct list_head *get_workspace(int type, int level) { - return btrfs_compress_op[type]->get_workspace(level); + switch (type) { + case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level); + case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level); + case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(type, level); + case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level); + default: + /* + * This can't happen, the type is validated several times + * before we get here. + */ + BUG(); + } } /* * put a workspace struct back on the list or free it if we have enough * idle ones sitting around */ -void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws) +void btrfs_put_workspace(int type, struct list_head *ws) { + struct workspace_manager *wsm; struct list_head *idle_ws; spinlock_t *ws_lock; atomic_t *total_ws; wait_queue_head_t *ws_wait; int *free_ws; + wsm = btrfs_compress_op[type]->workspace_manager; idle_ws = &wsm->idle_ws; ws_lock = &wsm->ws_lock; total_ws = &wsm->total_ws; @@ -964,7 +1119,7 @@ void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws) } spin_unlock(ws_lock); - wsm->ops->free_workspace(ws); + free_workspace(type, ws); atomic_dec(total_ws); wake: cond_wake_up(ws_wait); @@ -972,7 +1127,18 @@ wake: static void put_workspace(int type, struct list_head *ws) { - return btrfs_compress_op[type]->put_workspace(ws); + switch (type) { + case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws); + case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws); + case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(type, ws); + case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws); + default: + /* + * This can't happen, the type is validated several times + * before we get here. + */ + BUG(); + } } /* @@ -1009,11 +1175,10 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, struct list_head *workspace; int ret; + level = btrfs_compress_set_level(type, level); workspace = get_workspace(type, level); - ret = btrfs_compress_op[type]->compress_pages(workspace, mapping, - start, pages, - out_pages, - total_in, total_out); + ret = compression_compress_pages(type, workspace, mapping, start, pages, + out_pages, total_in, total_out); put_workspace(type, workspace); return ret; } @@ -1039,7 +1204,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) int type = cb->compress_type; workspace = get_workspace(type, 0); - ret = btrfs_compress_op[type]->decompress_bio(workspace, cb); + ret = compression_decompress_bio(type, workspace, cb); put_workspace(type, workspace); return ret; @@ -1057,9 +1222,8 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, int ret; workspace = get_workspace(type, 0); - ret = btrfs_compress_op[type]->decompress(workspace, data_in, - dest_page, start_byte, - srclen, destlen); + ret = compression_decompress(type, workspace, data_in, dest_page, + start_byte, srclen, destlen); put_workspace(type, workspace); return ret; @@ -1067,18 +1231,18 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, void __init btrfs_init_compress(void) { - int i; - - for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++) - btrfs_compress_op[i]->init_workspace_manager(); + btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); + btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); + btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); + zstd_init_workspace_manager(); } void __cold btrfs_exit_compress(void) { - int i; - - for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++) - btrfs_compress_op[i]->cleanup_workspace_manager(); + btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE); + btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB); + btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO); + zstd_cleanup_workspace_manager(); } /* @@ -1580,7 +1744,23 @@ unsigned int btrfs_compress_str2level(unsigned int type, const char *str) level = 0; } - level = btrfs_compress_op[type]->set_level(level); + level = btrfs_compress_set_level(type, level); + + return level; +} + +/* + * Adjust @level according to the limits of the compression algorithm or + * fallback to default + */ +unsigned int btrfs_compress_set_level(int type, unsigned level) +{ + const struct btrfs_compress_op *ops = btrfs_compress_op[type]; + + if (level == 0) + level = ops->default_level; + else + level = min(level, ops->max_level); return level; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 9976fe0f7526..d253f7aa8ed5 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -61,7 +61,7 @@ struct compressed_bio { * the start of a variable length array of checksums only * used by reads */ - u32 sums; + u8 sums[]; }; static inline unsigned int btrfs_compress_type(unsigned int type_level) @@ -93,7 +93,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long compressed_len, struct page **compressed_pages, unsigned long nr_pages, - unsigned int write_flags); + unsigned int write_flags, + struct cgroup_subsys_state *blkcg_css); blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); @@ -104,11 +105,10 @@ enum btrfs_compression_type { BTRFS_COMPRESS_ZLIB = 1, BTRFS_COMPRESS_LZO = 2, BTRFS_COMPRESS_ZSTD = 3, - BTRFS_COMPRESS_TYPES = 3, + BTRFS_NR_COMPRESS_TYPES = 4, }; struct workspace_manager { - const struct btrfs_compress_op *ops; struct list_head idle_ws; spinlock_t ws_lock; /* Number of free workspaces */ @@ -119,53 +119,18 @@ struct workspace_manager { wait_queue_head_t ws_wait; }; -void btrfs_init_workspace_manager(struct workspace_manager *wsm, - const struct btrfs_compress_op *ops); -struct list_head *btrfs_get_workspace(struct workspace_manager *wsm, - unsigned int level); -void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws); -void btrfs_cleanup_workspace_manager(struct workspace_manager *wsm); +struct list_head *btrfs_get_workspace(int type, unsigned int level); +void btrfs_put_workspace(int type, struct list_head *ws); struct btrfs_compress_op { - void (*init_workspace_manager)(void); - - void (*cleanup_workspace_manager)(void); - - struct list_head *(*get_workspace)(unsigned int level); - - void (*put_workspace)(struct list_head *ws); - - struct list_head *(*alloc_workspace)(unsigned int level); - - void (*free_workspace)(struct list_head *workspace); - - int (*compress_pages)(struct list_head *workspace, - struct address_space *mapping, - u64 start, - struct page **pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out); - - int (*decompress_bio)(struct list_head *workspace, - struct compressed_bio *cb); - - int (*decompress)(struct list_head *workspace, - unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen); - - /* - * This bounds the level set by the user to be within range of a - * particular compression type. It returns the level that will be used - * if the level is out of bounds or the default if 0 is passed in. - */ - unsigned int (*set_level)(unsigned int level); + struct workspace_manager *workspace_manager; + /* Maximum level supported by the compression algorithm */ + unsigned int max_level; + unsigned int default_level; }; /* The heuristic workspaces are managed via the 0th workspace manager */ -#define BTRFS_NR_WORKSPACE_MANAGERS (BTRFS_COMPRESS_TYPES + 1) +#define BTRFS_NR_WORKSPACE_MANAGERS BTRFS_NR_COMPRESS_TYPES extern const struct btrfs_compress_op btrfs_heuristic_compress; extern const struct btrfs_compress_op btrfs_zlib_compress; @@ -173,6 +138,9 @@ extern const struct btrfs_compress_op btrfs_lzo_compress; extern const struct btrfs_compress_op btrfs_zstd_compress; const char* btrfs_compress_type2str(enum btrfs_compression_type type); +bool btrfs_compress_is_valid_type(const char *str, size_t len); + +unsigned int btrfs_compress_set_level(int type, unsigned level); int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 324df36d28bf..24658b5a5787 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -21,44 +21,60 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *ins_key, struct btrfs_path *path, int data_size, int extend); static int push_node_left(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, int empty); static int balance_node_right(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -struct btrfs_path *btrfs_alloc_path(void) +static const struct btrfs_csums { + u16 size; + const char *name; + const char *driver; +} btrfs_csums[] = { + [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, + [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, + [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, + [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", + .driver = "blake2b-256" }, +}; + +int btrfs_super_csum_size(const struct btrfs_super_block *s) { - return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); + u16 t = btrfs_super_csum_type(s); + /* + * csum type is validated at mount time + */ + return btrfs_csums[t].size; +} + +const char *btrfs_super_csum_name(u16 csum_type) +{ + /* csum type is validated at mount time */ + return btrfs_csums[csum_type].name; } /* - * set all locked nodes in the path to blocking locks. This should - * be done before scheduling + * Return driver name if defined, otherwise the name that's also a valid driver + * name */ -noinline void btrfs_set_path_blocking(struct btrfs_path *p) +const char *btrfs_super_csum_driver(u16 csum_type) { - int i; - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - if (!p->nodes[i] || !p->locks[i]) - continue; - /* - * If we currently have a spinning reader or writer lock this - * will bump the count of blocking holders and drop the - * spinlock. - */ - if (p->locks[i] == BTRFS_READ_LOCK) { - btrfs_set_lock_blocking_read(p->nodes[i]); - p->locks[i] = BTRFS_READ_LOCK_BLOCKING; - } else if (p->locks[i] == BTRFS_WRITE_LOCK) { - btrfs_set_lock_blocking_write(p->nodes[i]); - p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; - } - } + /* csum type is validated at mount time */ + return btrfs_csums[csum_type].driver ?: + btrfs_csums[csum_type].name; +} + +size_t __const btrfs_get_num_csums(void) +{ + return ARRAY_SIZE(btrfs_csums); +} + +struct btrfs_path *btrfs_alloc_path(void) +{ + return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); } /* this also releases the path */ @@ -363,7 +379,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, for (node = rb_first(tm_root); node; node = next) { next = rb_next(node); tm = rb_entry(node, struct tree_mod_elem, node); - if (tm->seq > min_seq) + if (tm->seq >= min_seq) continue; rb_erase(node, tm_root); kfree(tm); @@ -378,8 +394,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * The 'start address' is the logical address of the *new* root node * for root replace operations, or the logical address of the affected * block for all other operations. - * - * Note: must be called with write lock for fs_info::tree_mod_log_lock. */ static noinline int __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) @@ -389,6 +403,8 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) struct rb_node *parent = NULL; struct tree_mod_elem *cur; + lockdep_assert_held_write(&fs_info->tree_mod_log_lock); + tm->seq = btrfs_inc_tree_mod_seq(fs_info); tm_root = &fs_info->tree_mod_log; @@ -726,11 +742,11 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) return __tree_mod_log_search(fs_info, start, min_seq, 0); } -static noinline int -tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, +static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items) { + struct btrfs_fs_info *fs_info = dst->fs_info; int ret = 0; struct tree_mod_elem **tm_list = NULL; struct tree_mod_elem **tm_list_add, **tm_list_rem; @@ -950,7 +966,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (new_flags != 0) { int level = btrfs_header_level(buf); - ret = btrfs_set_disk_extent_flags(trans, fs_info, + ret = btrfs_set_disk_extent_flags(trans, buf->start, buf->len, new_flags, level, 0); @@ -970,7 +986,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (ret) return ret; } - clean_tree_block(fs_info, buf); + btrfs_clean_tree_block(buf); *last_ref = 1; } return 0; @@ -1105,7 +1121,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) parent_start = buf->start; - extent_buffer_get(cow); + atomic_inc(&cow->refs); ret = tree_mod_log_insert_root(root->node, cow, 1); BUG_ON(ret < 0); rcu_assign_pointer(root->node, cow); @@ -1345,6 +1361,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) struct tree_mod_elem *tm; struct extent_buffer *eb = NULL; struct extent_buffer *eb_root; + u64 eb_root_owner = 0; struct extent_buffer *old; struct tree_mod_root *old_root = NULL; u64 old_generation = 0; @@ -1382,6 +1399,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(old); } } else if (old_root) { + eb_root_owner = btrfs_header_owner(eb_root); btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); eb = alloc_dummy_extent_buffer(fs_info, logical); @@ -1398,7 +1416,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (old_root) { btrfs_set_header_bytenr(eb, eb->start); btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(eb, btrfs_header_owner(eb_root)); + btrfs_set_header_owner(eb, eb_root_owner); btrfs_set_header_level(eb, old_root->level); btrfs_set_header_generation(eb, old_generation); } @@ -1541,7 +1559,7 @@ static int comp_keys(const struct btrfs_disk_key *disk, /* * same as comp_keys only with two btrfs_key's */ -int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2) +int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2) { if (k1->objectid > k2->objectid) return 1; @@ -1792,9 +1810,8 @@ static void root_sub_used(struct btrfs_root *root, u32 size) /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). */ -static noinline struct extent_buffer * -read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent, - int slot) +struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, + int slot) { int level = btrfs_header_level(parent); struct extent_buffer *eb; @@ -1806,7 +1823,7 @@ read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent, BUG_ON(level == 0); btrfs_node_key_to_cpu(parent, &first_key, slot); - eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot), + eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), btrfs_node_ptr_generation(parent, slot), level - 1, &first_key); if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { @@ -1863,7 +1880,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, return 0; /* promote the child to a root */ - child = read_node_slot(fs_info, mid, 0); + child = btrfs_read_node_slot(mid, 0); if (IS_ERR(child)) { ret = PTR_ERR(child); btrfs_handle_fs_error(fs_info, ret, NULL); @@ -1888,7 +1905,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, path->locks[level] = 0; path->nodes[level] = NULL; - clean_tree_block(fs_info, mid); + btrfs_clean_tree_block(mid); btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); @@ -1903,7 +1920,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) return 0; - left = read_node_slot(fs_info, parent, pslot - 1); + left = btrfs_read_node_slot(parent, pslot - 1); if (IS_ERR(left)) left = NULL; @@ -1918,7 +1935,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } } - right = read_node_slot(fs_info, parent, pslot + 1); + right = btrfs_read_node_slot(parent, pslot + 1); if (IS_ERR(right)) right = NULL; @@ -1936,7 +1953,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* first, try to make some room in the middle buffer */ if (left) { orig_slot += btrfs_header_nritems(left); - wret = push_node_left(trans, fs_info, left, mid, 1); + wret = push_node_left(trans, left, mid, 1); if (wret < 0) ret = wret; } @@ -1945,11 +1962,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, * then try to empty the right most buffer into the middle */ if (right) { - wret = push_node_left(trans, fs_info, mid, right, 1); + wret = push_node_left(trans, mid, right, 1); if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - clean_tree_block(fs_info, right); + btrfs_clean_tree_block(right); btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); @@ -1981,20 +1998,20 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_handle_fs_error(fs_info, ret, NULL); goto enospc; } - wret = balance_node_right(trans, fs_info, mid, left); + wret = balance_node_right(trans, mid, left); if (wret < 0) { ret = wret; goto enospc; } if (wret == 1) { - wret = push_node_left(trans, fs_info, left, mid, 1); + wret = push_node_left(trans, left, mid, 1); if (wret < 0) ret = wret; } BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - clean_tree_block(fs_info, mid); + btrfs_clean_tree_block(mid); btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); @@ -2015,7 +2032,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the path */ if (left) { if (btrfs_header_nritems(left) > orig_slot) { - extent_buffer_get(left); + atomic_inc(&left->refs); /* left was locked after cow */ path->nodes[level] = left; path->slots[level + 1] -= 1; @@ -2078,7 +2095,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (!parent) return 1; - left = read_node_slot(fs_info, parent, pslot - 1); + left = btrfs_read_node_slot(parent, pslot - 1); if (IS_ERR(left)) left = NULL; @@ -2098,8 +2115,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (ret) wret = 1; else { - wret = push_node_left(trans, fs_info, - left, mid, 0); + wret = push_node_left(trans, left, mid, 0); } } if (wret < 0) @@ -2131,7 +2147,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_tree_unlock(left); free_extent_buffer(left); } - right = read_node_slot(fs_info, parent, pslot + 1); + right = btrfs_read_node_slot(parent, pslot + 1); if (IS_ERR(right)) right = NULL; @@ -2154,8 +2170,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (ret) wret = 1; else { - wret = balance_node_right(trans, fs_info, - right, mid); + wret = balance_node_right(trans, right, mid); } } if (wret < 0) @@ -2360,32 +2375,6 @@ static noinline void unlock_up(struct btrfs_path *path, int level, } /* - * This releases any locks held in the path starting at level and - * going all the way up to the root. - * - * btrfs_search_slot will keep the lock held on higher nodes in a few - * corner cases, such as COW of the block at slot zero in the node. This - * ignores those rules, and it should only be called when there are no - * more updates to be done higher up in the tree. - */ -noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) -{ - int i; - - if (path->keep_locks) - return; - - for (i = level; i < BTRFS_MAX_LEVEL; i++) { - if (!path->nodes[i]) - continue; - if (!path->locks[i]) - continue; - btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); - path->locks[i] = 0; - } -} - -/* * helper function for btrfs_search_slot. The goal is to find a block * in cache without setting the path to blocking. If we find the block * we return zero and the path is unchanged. @@ -2416,6 +2405,16 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (tmp) { /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + /* + * Do extra check for first_key, eb can be stale due to + * being cached, read from scrub, or have multiple + * parents (shared tree blocks). + */ + if (btrfs_verify_level_key(tmp, + parent_level - 1, &first_key, gen)) { + free_extent_buffer(tmp); + return -EUCLEAN; + } *eb_ret = tmp; return 0; } @@ -2623,7 +2622,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, } else { b = root->commit_root; - extent_buffer_get(b); + atomic_inc(&b->refs); } level = btrfs_header_level(b); /* @@ -2706,7 +2705,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; int slot; int ret; @@ -2757,12 +2755,10 @@ again: } while (b) { + int dec = 0; + level = btrfs_header_level(b); - /* - * setup the path here so we can release it under lock - * contention with the cow code - */ if (cow) { bool last_level = (level == (BTRFS_MAX_LEVEL - 1)); @@ -2833,78 +2829,10 @@ cow_done: if (ret < 0) goto done; - if (level != 0) { - int dec = 0; - if (ret && slot > 0) { - dec = 1; - slot -= 1; - } - p->slots[level] = slot; - err = setup_nodes_for_search(trans, root, p, b, level, - ins_len, &write_lock_level); - if (err == -EAGAIN) - goto again; - if (err) { - ret = err; - goto done; - } - b = p->nodes[level]; - slot = p->slots[level]; - - /* - * slot 0 is special, if we change the key - * we have to update the parent pointer - * which means we must have a write lock - * on the parent - */ - if (slot == 0 && ins_len && - write_lock_level < level + 1) { - write_lock_level = level + 1; - btrfs_release_path(p); - goto again; - } - - unlock_up(p, level, lowest_unlock, - min_write_lock_level, &write_lock_level); - - if (level == lowest_level) { - if (dec) - p->slots[level]++; - goto done; - } - - err = read_block_for_search(root, p, &b, level, - slot, key); - if (err == -EAGAIN) - goto again; - if (err) { - ret = err; - goto done; - } - - if (!p->skip_locking) { - level = btrfs_header_level(b); - if (level <= write_lock_level) { - err = btrfs_try_tree_write_lock(b); - if (!err) { - btrfs_set_path_blocking(p); - btrfs_tree_lock(b); - } - p->locks[level] = BTRFS_WRITE_LOCK; - } else { - err = btrfs_tree_read_lock_atomic(b); - if (!err) { - btrfs_set_path_blocking(p); - btrfs_tree_read_lock(b); - } - p->locks[level] = BTRFS_READ_LOCK; - } - p->nodes[level] = b; - } - } else { + if (level == 0) { p->slots[level] = slot; if (ins_len > 0 && - btrfs_leaf_free_space(fs_info, b) < ins_len) { + btrfs_leaf_free_space(b) < ins_len) { if (write_lock_level < 1) { write_lock_level = 1; btrfs_release_path(p); @@ -2926,6 +2854,67 @@ cow_done: min_write_lock_level, NULL); goto done; } + if (ret && slot > 0) { + dec = 1; + slot--; + } + p->slots[level] = slot; + err = setup_nodes_for_search(trans, root, p, b, level, ins_len, + &write_lock_level); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + b = p->nodes[level]; + slot = p->slots[level]; + + /* + * Slot 0 is special, if we change the key we have to update + * the parent pointer which means we must have a write lock on + * the parent + */ + if (slot == 0 && ins_len && write_lock_level < level + 1) { + write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + + unlock_up(p, level, lowest_unlock, min_write_lock_level, + &write_lock_level); + + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } + + err = read_block_for_search(root, p, &b, level, slot, key); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + + if (!p->skip_locking) { + level = btrfs_header_level(b); + if (level <= write_lock_level) { + if (!btrfs_try_tree_write_lock(b)) { + btrfs_set_path_blocking(p); + btrfs_tree_lock(b); + } + p->locks[level] = BTRFS_WRITE_LOCK; + } else { + if (!btrfs_tree_read_lock_atomic(b)) { + btrfs_set_path_blocking(p); + btrfs_tree_read_lock(b); + } + p->locks[level] = BTRFS_READ_LOCK; + } + p->nodes[level] = b; + } } ret = 1; done: @@ -2982,6 +2971,8 @@ again: p->locks[level] = BTRFS_READ_LOCK; while (b) { + int dec = 0; + level = btrfs_header_level(b); p->nodes[level] = b; @@ -3002,48 +2993,45 @@ again: if (ret < 0) goto done; - if (level != 0) { - int dec = 0; - if (ret && slot > 0) { - dec = 1; - slot -= 1; - } + if (level == 0) { p->slots[level] = slot; unlock_up(p, level, lowest_unlock, 0, NULL); + goto done; + } - if (level == lowest_level) { - if (dec) - p->slots[level]++; - goto done; - } + if (ret && slot > 0) { + dec = 1; + slot--; + } + p->slots[level] = slot; + unlock_up(p, level, lowest_unlock, 0, NULL); - err = read_block_for_search(root, p, &b, level, - slot, key); - if (err == -EAGAIN) - goto again; - if (err) { - ret = err; - goto done; - } + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } - level = btrfs_header_level(b); - err = btrfs_tree_read_lock_atomic(b); - if (!err) { - btrfs_set_path_blocking(p); - btrfs_tree_read_lock(b); - } - b = tree_mod_log_rewind(fs_info, p, b, time_seq); - if (!b) { - ret = -ENOMEM; - goto done; - } - p->locks[level] = BTRFS_READ_LOCK; - p->nodes[level] = b; - } else { - p->slots[level] = slot; - unlock_up(p, level, lowest_unlock, 0, NULL); + err = read_block_for_search(root, p, &b, level, slot, key); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + + level = btrfs_header_level(b); + if (!btrfs_tree_read_lock_atomic(b)) { + btrfs_set_path_blocking(p); + btrfs_tree_read_lock(b); + } + b = tree_mod_log_rewind(fs_info, p, b, time_seq); + if (!b) { + ret = -ENOMEM; goto done; } + p->locks[level] = BTRFS_READ_LOCK; + p->nodes[level] = b; } ret = 1; done: @@ -3181,11 +3169,31 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, slot = path->slots[0]; if (slot > 0) { btrfs_item_key(eb, &disk_key, slot - 1); - BUG_ON(comp_keys(&disk_key, new_key) >= 0); + if (unlikely(comp_keys(&disk_key, new_key) >= 0)) { + btrfs_crit(fs_info, + "slot %u key (%llu %u %llu) new key (%llu %u %llu)", + slot, btrfs_disk_key_objectid(&disk_key), + btrfs_disk_key_type(&disk_key), + btrfs_disk_key_offset(&disk_key), + new_key->objectid, new_key->type, + new_key->offset); + btrfs_print_leaf(eb); + BUG(); + } } if (slot < btrfs_header_nritems(eb) - 1) { btrfs_item_key(eb, &disk_key, slot + 1); - BUG_ON(comp_keys(&disk_key, new_key) <= 0); + if (unlikely(comp_keys(&disk_key, new_key) <= 0)) { + btrfs_crit(fs_info, + "slot %u key (%llu %u %llu) new key (%llu %u %llu)", + slot, btrfs_disk_key_objectid(&disk_key), + btrfs_disk_key_type(&disk_key), + btrfs_disk_key_offset(&disk_key), + new_key->objectid, new_key->type, + new_key->offset); + btrfs_print_leaf(eb); + BUG(); + } } btrfs_cpu_key_to_disk(&disk_key, new_key); @@ -3203,10 +3211,10 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, * error, and > 0 if there was no room in the left hand block. */ static int push_node_left(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, int empty) { + struct btrfs_fs_info *fs_info = trans->fs_info; int push_items = 0; int src_nritems; int dst_nritems; @@ -3239,8 +3247,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); - ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0, - push_items); + ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -3278,10 +3285,10 @@ static int push_node_left(struct btrfs_trans_handle *trans, * this will only push up to 1/2 the contents of the left node over */ static int balance_node_right(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src) { + struct btrfs_fs_info *fs_info = trans->fs_info; int push_items = 0; int max_push; int src_nritems; @@ -3315,8 +3322,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, (dst_nritems) * sizeof(struct btrfs_key_ptr)); - ret = tree_mod_log_eb_copy(fs_info, dst, src, 0, - src_nritems - push_items, push_items); + ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, + push_items); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -3389,7 +3396,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, free_extent_buffer(old); add_root_to_dirty_list(root); - extent_buffer_get(c); + atomic_inc(&c->refs); path->nodes[level] = c; path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; path->slots[level] = 0; @@ -3404,7 +3411,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * blocknr is the block the key points to. */ static void insert_ptr(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_path *path, + struct btrfs_path *path, struct btrfs_disk_key *key, u64 bytenr, int slot, int level) { @@ -3417,7 +3424,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans, lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); BUG_ON(slot > nritems); - BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info)); + BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); if (slot != nritems) { if (level) { ret = tree_mod_log_insert_move(lower, slot + 1, slot, @@ -3501,7 +3508,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, root_add_used(root, fs_info->nodesize); ASSERT(btrfs_header_level(c) == level); - ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid); + ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -3517,7 +3524,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); - insert_ptr(trans, fs_info, path, &disk_key, split->start, + insert_ptr(trans, path, &disk_key, split->start, path->slots[level + 1] + 1, level + 1); if (path->slots[level] >= mid) { @@ -3549,7 +3556,7 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr) if (!nr) return 0; - btrfs_init_map_token(&token); + btrfs_init_map_token(&token, l); start_item = btrfs_item_nr(start); end_item = btrfs_item_nr(end); data_len = btrfs_token_item_offset(l, start_item, &token) + @@ -3565,9 +3572,9 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr) * the start of the leaf data. IOW, how much room * the leaf has left for both items and data */ -noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf) +noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) { + struct btrfs_fs_info *fs_info = leaf->fs_info; int nritems = btrfs_header_nritems(leaf); int ret; @@ -3586,13 +3593,13 @@ noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info, * min slot controls the lowest index we're willing to push to the * right. We'll push up to and including min_slot, but no lower */ -static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, +static noinline int __push_leaf_right(struct btrfs_path *path, int data_size, int empty, struct extent_buffer *right, int free_space, u32 left_nritems, u32 min_slot) { + struct btrfs_fs_info *fs_info = right->fs_info; struct extent_buffer *left = path->nodes[0]; struct extent_buffer *upper = path->nodes[1]; struct btrfs_map_token token; @@ -3607,8 +3614,6 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, u32 data_end; u32 this_item_size; - btrfs_init_map_token(&token); - if (empty) nr = 0; else @@ -3626,7 +3631,8 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, if (path->slots[0] > i) break; if (path->slots[0] == i) { - int space = btrfs_leaf_free_space(fs_info, left); + int space = btrfs_leaf_free_space(left); + if (space + push_space * 2 > free_space) break; } @@ -3655,10 +3661,10 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, right_nritems = btrfs_header_nritems(right); push_space = btrfs_item_end_nr(left, left_nritems - push_items); - push_space -= leaf_data_end(fs_info, left); + push_space -= leaf_data_end(left); /* make room in the right data area */ - data_end = leaf_data_end(fs_info, right); + data_end = leaf_data_end(right); memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + data_end - push_space, BTRFS_LEAF_DATA_OFFSET + data_end, @@ -3667,7 +3673,7 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, /* copy from the left data area */ copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left), + BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left), push_space); memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), @@ -3680,6 +3686,7 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, push_items * sizeof(struct btrfs_item)); /* update the item pointers */ + btrfs_init_map_token(&token, right); right_nritems += push_items; btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); @@ -3695,7 +3702,7 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, if (left_nritems) btrfs_mark_buffer_dirty(left); else - clean_tree_block(fs_info, left); + btrfs_clean_tree_block(left); btrfs_mark_buffer_dirty(right); @@ -3707,7 +3714,7 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; if (btrfs_header_nritems(path->nodes[0]) == 0) - clean_tree_block(fs_info, path->nodes[0]); + btrfs_clean_tree_block(path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3739,7 +3746,6 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root int min_data_size, int data_size, int empty, u32 min_slot) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *left = path->nodes[0]; struct extent_buffer *right; struct extent_buffer *upper; @@ -3758,7 +3764,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); - right = read_node_slot(fs_info, upper, slot + 1); + right = btrfs_read_node_slot(upper, slot + 1); /* * slot + 1 is not valid or we fail to read the right node, * no big deal, just return. @@ -3769,7 +3775,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_tree_lock(right); btrfs_set_lock_blocking_write(right); - free_space = btrfs_leaf_free_space(fs_info, right); + free_space = btrfs_leaf_free_space(right); if (free_space < data_size) goto out_unlock; @@ -3779,7 +3785,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (ret) goto out_unlock; - free_space = btrfs_leaf_free_space(fs_info, right); + free_space = btrfs_leaf_free_space(right); if (free_space < data_size) goto out_unlock; @@ -3800,7 +3806,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root return 0; } - return __push_leaf_right(fs_info, path, min_data_size, empty, + return __push_leaf_right(path, min_data_size, empty, right, free_space, left_nritems, min_slot); out_unlock: btrfs_tree_unlock(right); @@ -3816,12 +3822,12 @@ out_unlock: * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the * items */ -static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, int data_size, +static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, int empty, struct extent_buffer *left, int free_space, u32 right_nritems, u32 max_slot) { + struct btrfs_fs_info *fs_info = left->fs_info; struct btrfs_disk_key disk_key; struct extent_buffer *right = path->nodes[0]; int i; @@ -3835,8 +3841,6 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, u32 old_left_item_size; struct btrfs_map_token token; - btrfs_init_map_token(&token); - if (empty) nr = min(right_nritems, max_slot); else @@ -3849,7 +3853,8 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, if (path->slots[0] < i) break; if (path->slots[0] == i) { - int space = btrfs_leaf_free_space(fs_info, right); + int space = btrfs_leaf_free_space(right); + if (space + push_space * 2 > free_space) break; } @@ -3882,13 +3887,14 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, btrfs_item_offset_nr(right, push_items - 1); copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(fs_info, left) - push_space, + leaf_data_end(left) - push_space, BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset_nr(right, push_items - 1), push_space); old_left_nritems = btrfs_header_nritems(left); BUG_ON(old_left_nritems <= 0); + btrfs_init_map_token(&token, left); old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { u32 ioff; @@ -3909,17 +3915,19 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, if (push_items < right_nritems) { push_space = btrfs_item_offset_nr(right, push_items - 1) - - leaf_data_end(fs_info, right); + leaf_data_end(right); memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(fs_info, right), push_space); + leaf_data_end(right), push_space); memmove_extent_buffer(right, btrfs_item_nr_offset(0), btrfs_item_nr_offset(push_items), (btrfs_header_nritems(right) - push_items) * sizeof(struct btrfs_item)); } + + btrfs_init_map_token(&token, right); right_nritems -= push_items; btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); @@ -3935,7 +3943,7 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, if (right_nritems) btrfs_mark_buffer_dirty(right); else - clean_tree_block(fs_info, right); + btrfs_clean_tree_block(right); btrfs_item_key(right, &disk_key, 0); fixup_low_keys(path, &disk_key, 1); @@ -3972,7 +3980,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int min_data_size, int data_size, int empty, u32 max_slot) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *right = path->nodes[0]; struct extent_buffer *left; int slot; @@ -3992,7 +3999,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); - left = read_node_slot(fs_info, path->nodes[1], slot - 1); + left = btrfs_read_node_slot(path->nodes[1], slot - 1); /* * slot - 1 is not valid or we fail to read the left node, * no big deal, just return. @@ -4003,7 +4010,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_tree_lock(left); btrfs_set_lock_blocking_write(left); - free_space = btrfs_leaf_free_space(fs_info, left); + free_space = btrfs_leaf_free_space(left); if (free_space < data_size) { ret = 1; goto out; @@ -4019,13 +4026,13 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - free_space = btrfs_leaf_free_space(fs_info, left); + free_space = btrfs_leaf_free_space(left); if (free_space < data_size) { ret = 1; goto out; } - return __push_leaf_left(fs_info, path, min_data_size, + return __push_leaf_left(path, min_data_size, empty, left, free_space, right_nritems, max_slot); out: @@ -4039,23 +4046,21 @@ out: * available for the resulting leaf level of the path. */ static noinline void copy_for_split(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct extent_buffer *l, struct extent_buffer *right, int slot, int mid, int nritems) { + struct btrfs_fs_info *fs_info = trans->fs_info; int data_copy_size; int rt_data_off; int i; struct btrfs_disk_key disk_key; struct btrfs_map_token token; - btrfs_init_map_token(&token); - nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); - data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l); + data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l); copy_extent_buffer(right, l, btrfs_item_nr_offset(0), btrfs_item_nr_offset(mid), @@ -4064,10 +4069,11 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, copy_extent_buffer(right, l, BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(fs_info, l), data_copy_size); + leaf_data_end(l), data_copy_size); rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); + btrfs_init_map_token(&token, right); for (i = 0; i < nritems; i++) { struct btrfs_item *item = btrfs_item_nr(i); u32 ioff; @@ -4079,8 +4085,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(l, mid); btrfs_item_key(right, &disk_key, 0); - insert_ptr(trans, fs_info, path, &disk_key, right->start, - path->slots[1] + 1, 1); + insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); btrfs_mark_buffer_dirty(right); btrfs_mark_buffer_dirty(l); @@ -4115,7 +4120,6 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, struct btrfs_path *path, int data_size) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; int progress = 0; int slot; @@ -4124,7 +4128,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, slot = path->slots[0]; if (slot < btrfs_header_nritems(path->nodes[0])) - space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]); + space_needed -= btrfs_leaf_free_space(path->nodes[0]); /* * try to push all the items after our slot into the @@ -4145,14 +4149,14 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, if (path->slots[0] == 0 || path->slots[0] == nritems) return 0; - if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size) + if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) return 0; /* try to push all the items before our slot into the next leaf */ slot = path->slots[0]; space_needed = data_size; if (slot > 0) - space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]); + space_needed -= btrfs_leaf_free_space(path->nodes[0]); ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); if (ret < 0) return ret; @@ -4201,7 +4205,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int space_needed = data_size; if (slot < btrfs_header_nritems(l)) - space_needed -= btrfs_leaf_free_space(fs_info, l); + space_needed -= btrfs_leaf_free_space(l); wret = push_leaf_right(trans, root, path, space_needed, space_needed, 0, 0); @@ -4210,8 +4214,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, if (wret) { space_needed = data_size; if (slot > 0) - space_needed -= btrfs_leaf_free_space(fs_info, - l); + space_needed -= btrfs_leaf_free_space(l); wret = push_leaf_left(trans, root, path, space_needed, space_needed, 0, (u32)-1); if (wret < 0) @@ -4220,7 +4223,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, l = path->nodes[0]; /* did the pushes work? */ - if (btrfs_leaf_free_space(fs_info, l) >= data_size) + if (btrfs_leaf_free_space(l) >= data_size) return 0; } @@ -4288,7 +4291,7 @@ again: if (split == 0) { if (mid <= slot) { btrfs_set_header_nritems(right, 0); - insert_ptr(trans, fs_info, path, &disk_key, + insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); @@ -4297,7 +4300,7 @@ again: path->slots[1] += 1; } else { btrfs_set_header_nritems(right, 0); - insert_ptr(trans, fs_info, path, &disk_key, + insert_ptr(trans, path, &disk_key, right->start, path->slots[1], 1); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); @@ -4314,7 +4317,7 @@ again: return ret; } - copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems); + copy_for_split(trans, path, l, right, slot, mid, nritems); if (split == 2) { BUG_ON(num_doubles != 0); @@ -4327,7 +4330,7 @@ again: push_for_double: push_for_double_split(trans, root, path, data_size); tried_avoid_double = 1; - if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size) + if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) return 0; goto again; } @@ -4336,7 +4339,6 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int ins_len) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; @@ -4350,7 +4352,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && key.type != BTRFS_EXTENT_CSUM_KEY); - if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len) + if (btrfs_leaf_free_space(leaf) >= ins_len) return 0; item_size = btrfs_item_size_nr(leaf, path->slots[0]); @@ -4377,7 +4379,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, goto err; /* the leaf has changed, it now has room. return now */ - if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len) + if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len) goto err; if (key.type == BTRFS_EXTENT_DATA_KEY) { @@ -4400,8 +4402,7 @@ err: return ret; } -static noinline int split_item(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, +static noinline int split_item(struct btrfs_path *path, const struct btrfs_key *new_key, unsigned long split_offset) { @@ -4416,7 +4417,7 @@ static noinline int split_item(struct btrfs_fs_info *fs_info, struct btrfs_disk_key disk_key; leaf = path->nodes[0]; - BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item)); + BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); btrfs_set_path_blocking(path); @@ -4465,7 +4466,7 @@ static noinline int split_item(struct btrfs_fs_info *fs_info, item_size - split_offset); btrfs_mark_buffer_dirty(leaf); - BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0); + BUG_ON(btrfs_leaf_free_space(leaf) < 0); kfree(buf); return 0; } @@ -4497,7 +4498,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = split_item(root->fs_info, path, new_key, split_offset); + ret = split_item(path, new_key, split_offset); return ret; } @@ -4543,8 +4544,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, * off the end of the item or if we shift the item to chop bytes off * the front. */ -void btrfs_truncate_item(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u32 new_size, int from_end) +void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; @@ -4557,8 +4557,6 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, int i; struct btrfs_map_token token; - btrfs_init_map_token(&token); - leaf = path->nodes[0]; slot = path->slots[0]; @@ -4567,7 +4565,7 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, return; nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(fs_info, leaf); + data_end = leaf_data_end(leaf); old_data_start = btrfs_item_offset_nr(leaf, slot); @@ -4580,6 +4578,7 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ + btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; item = btrfs_item_nr(i); @@ -4633,7 +4632,7 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, btrfs_set_item_size(leaf, item, new_size); btrfs_mark_buffer_dirty(leaf); - if (btrfs_leaf_free_space(fs_info, leaf) < 0) { + if (btrfs_leaf_free_space(leaf) < 0) { btrfs_print_leaf(leaf); BUG(); } @@ -4642,8 +4641,7 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, /* * make the item pointed to by the path bigger, data_size is the added size. */ -void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - u32 data_size) +void btrfs_extend_item(struct btrfs_path *path, u32 data_size) { int slot; struct extent_buffer *leaf; @@ -4655,14 +4653,12 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, int i; struct btrfs_map_token token; - btrfs_init_map_token(&token); - leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(fs_info, leaf); + data_end = leaf_data_end(leaf); - if (btrfs_leaf_free_space(fs_info, leaf) < data_size) { + if (btrfs_leaf_free_space(leaf) < data_size) { btrfs_print_leaf(leaf); BUG(); } @@ -4672,15 +4668,16 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, BUG_ON(slot < 0); if (slot >= nritems) { btrfs_print_leaf(leaf); - btrfs_crit(fs_info, "slot %d too large, nritems %d", + btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", slot, nritems); - BUG_ON(1); + BUG(); } /* * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ + btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; item = btrfs_item_nr(i); @@ -4701,7 +4698,7 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, btrfs_set_item_size(leaf, item, old_size + data_size); btrfs_mark_buffer_dirty(leaf); - if (btrfs_leaf_free_space(fs_info, leaf) < 0) { + if (btrfs_leaf_free_space(leaf) < 0) { btrfs_print_leaf(leaf); BUG(); } @@ -4732,21 +4729,20 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, } btrfs_unlock_up_safe(path, 1); - btrfs_init_map_token(&token); - leaf = path->nodes[0]; slot = path->slots[0]; nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(fs_info, leaf); + data_end = leaf_data_end(leaf); - if (btrfs_leaf_free_space(fs_info, leaf) < total_size) { + if (btrfs_leaf_free_space(leaf) < total_size) { btrfs_print_leaf(leaf); btrfs_crit(fs_info, "not enough freespace need %u have %d", - total_size, btrfs_leaf_free_space(fs_info, leaf)); + total_size, btrfs_leaf_free_space(leaf)); BUG(); } + btrfs_init_map_token(&token, leaf); if (slot != nritems) { unsigned int old_data = btrfs_item_end_nr(leaf, slot); @@ -4754,7 +4750,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, btrfs_print_leaf(leaf); btrfs_crit(fs_info, "slot %d old_data %d data_end %d", slot, old_data, data_end); - BUG_ON(1); + BUG(); } /* * item0..itemN ... dataN.offset..dataN.size .. data0.size @@ -4794,7 +4790,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, btrfs_set_header_nritems(leaf, nritems + nr); btrfs_mark_buffer_dirty(leaf); - if (btrfs_leaf_free_space(fs_info, leaf) < 0) { + if (btrfs_leaf_free_space(leaf) < 0) { btrfs_print_leaf(leaf); BUG(); } @@ -4933,7 +4929,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); - extent_buffer_get(leaf); + atomic_inc(&leaf->refs); btrfs_free_tree_block(trans, root, leaf, 0, 1); free_extent_buffer_stale(leaf); } @@ -4953,9 +4949,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, int wret; int i; u32 nritems; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); leaf = path->nodes[0]; last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); @@ -4966,13 +4959,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, nritems = btrfs_header_nritems(leaf); if (slot + nr != nritems) { - int data_end = leaf_data_end(fs_info, leaf); + int data_end = leaf_data_end(leaf); + struct btrfs_map_token token; memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + data_end + dsize, BTRFS_LEAF_DATA_OFFSET + data_end, last_off - data_end); + btrfs_init_map_token(&token, leaf); for (i = slot + nr; i < nritems; i++) { u32 ioff; @@ -4996,7 +4991,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_level(leaf, 0); } else { btrfs_set_path_blocking(path); - clean_tree_block(fs_info, leaf); + btrfs_clean_tree_block(leaf); btrfs_del_leaf(trans, root, path, leaf); } } else { @@ -5015,7 +5010,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, * for possible call to del_ptr below */ slot = path->slots[1]; - extent_buffer_get(leaf); + atomic_inc(&leaf->refs); btrfs_set_path_blocking(path); wret = push_leaf_left(trans, root, path, 1, 1, @@ -5126,7 +5121,6 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_path *path, u64 min_trans) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *cur; struct btrfs_key found_key; int slot; @@ -5207,7 +5201,7 @@ find_next_key: goto out; } btrfs_set_path_blocking(path); - cur = read_node_slot(fs_info, cur, slot); + cur = btrfs_read_node_slot(cur, slot); if (IS_ERR(cur)) { ret = PTR_ERR(cur); goto out; @@ -5229,371 +5223,6 @@ out: return ret; } -static int tree_move_down(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - int *level) -{ - struct extent_buffer *eb; - - BUG_ON(*level == 0); - eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]); - if (IS_ERR(eb)) - return PTR_ERR(eb); - - path->nodes[*level - 1] = eb; - path->slots[*level - 1] = 0; - (*level)--; - return 0; -} - -static int tree_move_next_or_upnext(struct btrfs_path *path, - int *level, int root_level) -{ - int ret = 0; - int nritems; - nritems = btrfs_header_nritems(path->nodes[*level]); - - path->slots[*level]++; - - while (path->slots[*level] >= nritems) { - if (*level == root_level) - return -1; - - /* move upnext */ - path->slots[*level] = 0; - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; - (*level)++; - path->slots[*level]++; - - nritems = btrfs_header_nritems(path->nodes[*level]); - ret = 1; - } - return ret; -} - -/* - * Returns 1 if it had to move up and next. 0 is returned if it moved only next - * or down. - */ -static int tree_advance(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - int *level, int root_level, - int allow_down, - struct btrfs_key *key) -{ - int ret; - - if (*level == 0 || !allow_down) { - ret = tree_move_next_or_upnext(path, level, root_level); - } else { - ret = tree_move_down(fs_info, path, level); - } - if (ret >= 0) { - if (*level == 0) - btrfs_item_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - else - btrfs_node_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - } - return ret; -} - -static int tree_compare_item(struct btrfs_path *left_path, - struct btrfs_path *right_path, - char *tmp_buf) -{ - int cmp; - int len1, len2; - unsigned long off1, off2; - - len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); - len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); - if (len1 != len2) - return 1; - - off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); - off2 = btrfs_item_ptr_offset(right_path->nodes[0], - right_path->slots[0]); - - read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); - - cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); - if (cmp) - return 1; - return 0; -} - -#define ADVANCE 1 -#define ADVANCE_ONLY_NEXT -1 - -/* - * This function compares two trees and calls the provided callback for - * every changed/new/deleted item it finds. - * If shared tree blocks are encountered, whole subtrees are skipped, making - * the compare pretty fast on snapshotted subvolumes. - * - * This currently works on commit roots only. As commit roots are read only, - * we don't do any locking. The commit roots are protected with transactions. - * Transactions are ended and rejoined when a commit is tried in between. - * - * This function checks for modifications done to the trees while comparing. - * If it detects a change, it aborts immediately. - */ -int btrfs_compare_trees(struct btrfs_root *left_root, - struct btrfs_root *right_root, - btrfs_changed_cb_t changed_cb, void *ctx) -{ - struct btrfs_fs_info *fs_info = left_root->fs_info; - int ret; - int cmp; - struct btrfs_path *left_path = NULL; - struct btrfs_path *right_path = NULL; - struct btrfs_key left_key; - struct btrfs_key right_key; - char *tmp_buf = NULL; - int left_root_level; - int right_root_level; - int left_level; - int right_level; - int left_end_reached; - int right_end_reached; - int advance_left; - int advance_right; - u64 left_blockptr; - u64 right_blockptr; - u64 left_gen; - u64 right_gen; - - left_path = btrfs_alloc_path(); - if (!left_path) { - ret = -ENOMEM; - goto out; - } - right_path = btrfs_alloc_path(); - if (!right_path) { - ret = -ENOMEM; - goto out; - } - - tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); - if (!tmp_buf) { - ret = -ENOMEM; - goto out; - } - - left_path->search_commit_root = 1; - left_path->skip_locking = 1; - right_path->search_commit_root = 1; - right_path->skip_locking = 1; - - /* - * Strategy: Go to the first items of both trees. Then do - * - * If both trees are at level 0 - * Compare keys of current items - * If left < right treat left item as new, advance left tree - * and repeat - * If left > right treat right item as deleted, advance right tree - * and repeat - * If left == right do deep compare of items, treat as changed if - * needed, advance both trees and repeat - * If both trees are at the same level but not at level 0 - * Compare keys of current nodes/leafs - * If left < right advance left tree and repeat - * If left > right advance right tree and repeat - * If left == right compare blockptrs of the next nodes/leafs - * If they match advance both trees but stay at the same level - * and repeat - * If they don't match advance both trees while allowing to go - * deeper and repeat - * If tree levels are different - * Advance the tree that needs it and repeat - * - * Advancing a tree means: - * If we are at level 0, try to go to the next slot. If that's not - * possible, go one level up and repeat. Stop when we found a level - * where we could go to the next slot. We may at this point be on a - * node or a leaf. - * - * If we are not at level 0 and not on shared tree blocks, go one - * level deeper. - * - * If we are not at level 0 and on shared tree blocks, go one slot to - * the right if possible or go up and right. - */ - - down_read(&fs_info->commit_root_sem); - left_level = btrfs_header_level(left_root->commit_root); - left_root_level = left_level; - left_path->nodes[left_level] = - btrfs_clone_extent_buffer(left_root->commit_root); - if (!left_path->nodes[left_level]) { - up_read(&fs_info->commit_root_sem); - ret = -ENOMEM; - goto out; - } - - right_level = btrfs_header_level(right_root->commit_root); - right_root_level = right_level; - right_path->nodes[right_level] = - btrfs_clone_extent_buffer(right_root->commit_root); - if (!right_path->nodes[right_level]) { - up_read(&fs_info->commit_root_sem); - ret = -ENOMEM; - goto out; - } - up_read(&fs_info->commit_root_sem); - - if (left_level == 0) - btrfs_item_key_to_cpu(left_path->nodes[left_level], - &left_key, left_path->slots[left_level]); - else - btrfs_node_key_to_cpu(left_path->nodes[left_level], - &left_key, left_path->slots[left_level]); - if (right_level == 0) - btrfs_item_key_to_cpu(right_path->nodes[right_level], - &right_key, right_path->slots[right_level]); - else - btrfs_node_key_to_cpu(right_path->nodes[right_level], - &right_key, right_path->slots[right_level]); - - left_end_reached = right_end_reached = 0; - advance_left = advance_right = 0; - - while (1) { - if (advance_left && !left_end_reached) { - ret = tree_advance(fs_info, left_path, &left_level, - left_root_level, - advance_left != ADVANCE_ONLY_NEXT, - &left_key); - if (ret == -1) - left_end_reached = ADVANCE; - else if (ret < 0) - goto out; - advance_left = 0; - } - if (advance_right && !right_end_reached) { - ret = tree_advance(fs_info, right_path, &right_level, - right_root_level, - advance_right != ADVANCE_ONLY_NEXT, - &right_key); - if (ret == -1) - right_end_reached = ADVANCE; - else if (ret < 0) - goto out; - advance_right = 0; - } - - if (left_end_reached && right_end_reached) { - ret = 0; - goto out; - } else if (left_end_reached) { - if (right_level == 0) { - ret = changed_cb(left_path, right_path, - &right_key, - BTRFS_COMPARE_TREE_DELETED, - ctx); - if (ret < 0) - goto out; - } - advance_right = ADVANCE; - continue; - } else if (right_end_reached) { - if (left_level == 0) { - ret = changed_cb(left_path, right_path, - &left_key, - BTRFS_COMPARE_TREE_NEW, - ctx); - if (ret < 0) - goto out; - } - advance_left = ADVANCE; - continue; - } - - if (left_level == 0 && right_level == 0) { - cmp = btrfs_comp_cpu_keys(&left_key, &right_key); - if (cmp < 0) { - ret = changed_cb(left_path, right_path, - &left_key, - BTRFS_COMPARE_TREE_NEW, - ctx); - if (ret < 0) - goto out; - advance_left = ADVANCE; - } else if (cmp > 0) { - ret = changed_cb(left_path, right_path, - &right_key, - BTRFS_COMPARE_TREE_DELETED, - ctx); - if (ret < 0) - goto out; - advance_right = ADVANCE; - } else { - enum btrfs_compare_tree_result result; - - WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); - ret = tree_compare_item(left_path, right_path, - tmp_buf); - if (ret) - result = BTRFS_COMPARE_TREE_CHANGED; - else - result = BTRFS_COMPARE_TREE_SAME; - ret = changed_cb(left_path, right_path, - &left_key, result, ctx); - if (ret < 0) - goto out; - advance_left = ADVANCE; - advance_right = ADVANCE; - } - } else if (left_level == right_level) { - cmp = btrfs_comp_cpu_keys(&left_key, &right_key); - if (cmp < 0) { - advance_left = ADVANCE; - } else if (cmp > 0) { - advance_right = ADVANCE; - } else { - left_blockptr = btrfs_node_blockptr( - left_path->nodes[left_level], - left_path->slots[left_level]); - right_blockptr = btrfs_node_blockptr( - right_path->nodes[right_level], - right_path->slots[right_level]); - left_gen = btrfs_node_ptr_generation( - left_path->nodes[left_level], - left_path->slots[left_level]); - right_gen = btrfs_node_ptr_generation( - right_path->nodes[right_level], - right_path->slots[right_level]); - if (left_blockptr == right_blockptr && - left_gen == right_gen) { - /* - * As we're on a shared block, don't - * allow to go deeper. - */ - advance_left = ADVANCE_ONLY_NEXT; - advance_right = ADVANCE_ONLY_NEXT; - } else { - advance_left = ADVANCE; - advance_right = ADVANCE; - } - } - } else if (left_level < right_level) { - advance_right = ADVANCE; - } else { - advance_left = ADVANCE; - } - } - -out: - btrfs_free_path(left_path); - btrfs_free_path(right_path); - kvfree(tmp_buf); - return ret; -} - /* * this is similar to btrfs_next_leaf, but does not try to preserve * and fixup the path. It looks for and returns the next key in the @@ -5611,7 +5240,7 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, int slot; struct extent_buffer *c; - WARN_ON(!path->keep_locks); + WARN_ON(!path->keep_locks && !path->skip_locking); while (level < BTRFS_MAX_LEVEL) { if (!path->nodes[level]) return 1; @@ -5627,7 +5256,7 @@ next: !path->nodes[level + 1]) return 1; - if (path->locks[level + 1]) { + if (path->locks[level + 1] || path->skip_locking) { level++; continue; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b3642367a595..54efb21c2727 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -16,9 +16,9 @@ #include <linux/backing-dev.h> #include <linux/wait.h> #include <linux/slab.h> -#include <linux/kobject.h> #include <trace/events/btrfs.h> #include <asm/kmap_types.h> +#include <asm/unaligned.h> #include <linux/pagemap.h> #include <linux/btrfs.h> #include <linux/btrfs_tree.h> @@ -28,23 +28,38 @@ #include <linux/dynamic_debug.h> #include <linux/refcount.h> #include <linux/crc32c.h> +#include "extent-io-tree.h" #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" +#include "block-rsv.h" struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; +struct btrfs_space_info; +struct btrfs_block_group; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; +extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; +struct btrfs_ref; #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ -#define BTRFS_MAX_MIRRORS 3 +/* + * Maximum number of mirrors that can be available for all profiles counting + * the target device of dev-replace as one. During an active device replace + * procedure, the target device of the copy operation is a mirror for the + * filesystem data as well that can be used to read data in order to repair + * read errors on other disks. + * + * Current value is derived from RAID1C4 with 4 copies. + */ +#define BTRFS_MAX_MIRRORS (4 + 1) #define BTRFS_MAX_LEVEL 8 @@ -69,9 +84,6 @@ struct btrfs_ordered_sum; */ #define BTRFS_LINK_MAX 65535U -/* four bytes for CRC32 */ -static const int btrfs_csum_sizes[] = { 4 }; - #define BTRFS_EMPTY_DIR_SIZE 0 /* ioprio of readahead is set to idle */ @@ -98,10 +110,6 @@ static inline u32 count_max_extents(u64 size) return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); } -struct btrfs_mapping_tree { - struct extent_map_tree map_tree; -}; - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -284,7 +292,8 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ - BTRFS_FEATURE_INCOMPAT_METADATA_UUID) + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34) #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) @@ -387,122 +396,6 @@ struct btrfs_dev_replace { wait_queue_head_t replace_wait; }; -/* For raid type sysfs entries */ -struct raid_kobject { - u64 flags; - struct kobject kobj; - struct list_head list; -}; - -struct btrfs_space_info { - spinlock_t lock; - - u64 total_bytes; /* total bytes in the space, - this doesn't take mirrors into account */ - u64 bytes_used; /* total bytes used, - this doesn't take mirrors into account */ - u64 bytes_pinned; /* total bytes pinned, will be freed when the - transaction finishes */ - u64 bytes_reserved; /* total bytes the allocator has reserved for - current allocations */ - u64 bytes_may_use; /* number of bytes that may be used for - delalloc/allocations */ - u64 bytes_readonly; /* total bytes that are read only */ - - u64 max_extent_size; /* This will hold the maximum extent size of - the space info if we had an ENOSPC in the - allocator. */ - - unsigned int full:1; /* indicates that we cannot allocate any more - chunks for this space */ - unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ - - unsigned int flush:1; /* set if we are trying to make space */ - - unsigned int force_alloc; /* set if we need to force a chunk - alloc for this space */ - - u64 disk_used; /* total bytes used on disk */ - u64 disk_total; /* total bytes on disk, takes mirrors into - account */ - - u64 flags; - - /* - * bytes_pinned is kept in line with what is actually pinned, as in - * we've called update_block_group and dropped the bytes_used counter - * and increased the bytes_pinned counter. However this means that - * bytes_pinned does not reflect the bytes that will be pinned once the - * delayed refs are flushed, so this counter is inc'ed every time we - * call btrfs_free_extent so it is a realtime count of what will be - * freed once the transaction is committed. It will be zeroed every - * time the transaction commits. - */ - struct percpu_counter total_bytes_pinned; - - struct list_head list; - /* Protected by the spinlock 'lock'. */ - struct list_head ro_bgs; - struct list_head priority_tickets; - struct list_head tickets; - /* - * tickets_id just indicates the next ticket will be handled, so note - * it's not stored per ticket. - */ - u64 tickets_id; - - struct rw_semaphore groups_sem; - /* for block groups in our same type */ - struct list_head block_groups[BTRFS_NR_RAID_TYPES]; - wait_queue_head_t wait; - - struct kobject kobj; - struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; -}; - -/* - * Types of block reserves - */ -enum { - BTRFS_BLOCK_RSV_GLOBAL, - BTRFS_BLOCK_RSV_DELALLOC, - BTRFS_BLOCK_RSV_TRANS, - BTRFS_BLOCK_RSV_CHUNK, - BTRFS_BLOCK_RSV_DELOPS, - BTRFS_BLOCK_RSV_DELREFS, - BTRFS_BLOCK_RSV_EMPTY, - BTRFS_BLOCK_RSV_TEMP, -}; - -struct btrfs_block_rsv { - u64 size; - u64 reserved; - struct btrfs_space_info *space_info; - spinlock_t lock; - unsigned short full; - unsigned short type; - unsigned short failfast; - - /* - * Qgroup equivalent for @size @reserved - * - * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care - * about things like csum size nor how many tree blocks it will need to - * reserve. - * - * Qgroup cares more about net change of the extent usage. - * - * So for one newly inserted file extent, in worst case it will cause - * leaf split and level increase, nodesize for each file extent is - * already too much. - * - * In short, qgroup_size/reserved is the upper limit of possible needed - * qgroup metadata reservation. - */ - u64 qgroup_rsv_size; - u64 qgroup_rsv_reserved; -}; - /* * free clusters are used to claim free space in relatively large chunks, * allowing us to do less seeky writes. They are used for all metadata @@ -522,7 +415,7 @@ struct btrfs_free_cluster { /* We did a full search and couldn't create a cluster */ bool fragmented; - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; /* * when a cluster is allocated from a block group, we put the * cluster onto a list in the block group so that it can @@ -539,40 +432,6 @@ enum btrfs_caching_type { BTRFS_CACHE_ERROR, }; -enum btrfs_disk_cache_state { - BTRFS_DC_WRITTEN, - BTRFS_DC_ERROR, - BTRFS_DC_CLEAR, - BTRFS_DC_SETUP, -}; - -struct btrfs_caching_control { - struct list_head list; - struct mutex mutex; - wait_queue_head_t wait; - struct btrfs_work work; - struct btrfs_block_group_cache *block_group; - u64 progress; - refcount_t count; -}; - -/* Once caching_thread() finds this much free space, it will wake up waiters. */ -#define CACHING_CTL_WAKE_UP SZ_2M - -struct btrfs_io_ctl { - void *cur, *orig; - struct page *page; - struct page **pages; - struct btrfs_fs_info *fs_info; - struct inode *inode; - unsigned long size; - int index; - int num_pages; - int entries; - int bitmaps; - unsigned check_crcs:1; -}; - /* * Tree to record all locked full stripes of a RAID5/6 block group */ @@ -581,120 +440,6 @@ struct btrfs_full_stripe_locks_tree { struct mutex lock; }; -struct btrfs_block_group_cache { - struct btrfs_key key; - struct btrfs_block_group_item item; - struct btrfs_fs_info *fs_info; - struct inode *inode; - spinlock_t lock; - u64 pinned; - u64 reserved; - u64 delalloc_bytes; - u64 bytes_super; - u64 flags; - u64 cache_generation; - - /* - * If the free space extent count exceeds this number, convert the block - * group to bitmaps. - */ - u32 bitmap_high_thresh; - - /* - * If the free space extent count drops below this number, convert the - * block group back to extents. - */ - u32 bitmap_low_thresh; - - /* - * It is just used for the delayed data space allocation because - * only the data space allocation and the relative metadata update - * can be done cross the transaction. - */ - struct rw_semaphore data_rwsem; - - /* for raid56, this is a full stripe, without parity */ - unsigned long full_stripe_len; - - unsigned int ro; - unsigned int iref:1; - unsigned int has_caching_ctl:1; - unsigned int removed:1; - - int disk_cache_state; - - /* cache tracking stuff */ - int cached; - struct btrfs_caching_control *caching_ctl; - u64 last_byte_to_unpin; - - struct btrfs_space_info *space_info; - - /* free space cache stuff */ - struct btrfs_free_space_ctl *free_space_ctl; - - /* block group cache stuff */ - struct rb_node cache_node; - - /* for block groups in the same raid type */ - struct list_head list; - - /* usage count */ - atomic_t count; - - /* List of struct btrfs_free_clusters for this block group. - * Today it will only have one thing on it, but that may change - */ - struct list_head cluster_list; - - /* For delayed block group creation or deletion of empty block groups */ - struct list_head bg_list; - - /* For read-only block groups */ - struct list_head ro_list; - - atomic_t trimming; - - /* For dirty block groups */ - struct list_head dirty_list; - struct list_head io_list; - - struct btrfs_io_ctl io_ctl; - - /* - * Incremented when doing extent allocations and holding a read lock - * on the space_info's groups_sem semaphore. - * Decremented when an ordered extent that represents an IO against this - * block group's range is created (after it's added to its inode's - * root's list of ordered extents) or immediately after the allocation - * if it's a metadata extent or fallocate extent (for these cases we - * don't create ordered extents). - */ - atomic_t reservations; - - /* - * Incremented while holding the spinlock *lock* by a task checking if - * it can perform a nocow write (incremented if the value for the *ro* - * field is 0). Decremented by such tasks once they create an ordered - * extent or before that if some error happens before reaching that step. - * This is to prevent races between block group relocation and nocow - * writes through direct IO. - */ - atomic_t nocow_writers; - - /* Lock for free space tree operations. */ - struct mutex free_space_lock; - - /* - * Does the block group need to be added to the free space tree? - * Protected by free_space_lock. - */ - int needs_free_space; - - /* Record locked full stripes for RAID5/6 block group */ - struct btrfs_full_stripe_locks_tree full_stripe_locks_root; -}; - /* delayed seq elem */ struct seq_list { struct list_head list; @@ -710,22 +455,6 @@ enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_DONE = 2, }; -/* used by the raid56 code to lock stripes for read/modify/write */ -struct btrfs_stripe_hash { - struct list_head hash_list; - spinlock_t lock; -}; - -/* used by the raid56 code to lock stripes for read/modify/write */ -struct btrfs_stripe_hash_table { - struct list_head stripe_cache; - spinlock_t cache_lock; - int cache_size; - struct btrfs_stripe_hash table[]; -}; - -#define BTRFS_STRIPE_HASH_TABLE_BITS 11 - void btrfs_init_async_reclaim_work(struct work_struct *work); /* fs_info */ @@ -749,8 +478,8 @@ struct btrfs_swapfile_pin { void *ptr; struct inode *inode; /* - * If true, ptr points to a struct btrfs_block_group_cache. Otherwise, - * ptr points to a struct btrfs_device. + * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr + * points to a struct btrfs_device. */ bool is_block_group; }; @@ -785,11 +514,18 @@ enum { /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. + * Set and cleared while holding fs_info::balance_mutex. */ BTRFS_FS_BALANCE_RUNNING, /* Indicate that the cleaner thread is awake and doing something. */ BTRFS_FS_CLEANER_RUNNING, + + /* + * The checksumming has an optimized version and is considered fast, + * so we don't need to offload checksums to workqueues. + */ + BTRFS_FS_CSUM_IMPL_FAST, }; struct btrfs_fs_info { @@ -823,7 +559,7 @@ struct btrfs_fs_info { struct extent_io_tree *pinned_extents; /* logical->physical extent mapping */ - struct btrfs_mapping_tree mapping_tree; + struct extent_map_tree mapping_tree; /* * block reservation for extent, checksum, root tree and @@ -988,7 +724,6 @@ struct btrfs_fs_info { struct btrfs_workqueue *endio_meta_write_workers; struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; - struct btrfs_workqueue *submit_workers; struct btrfs_workqueue *caching_workers; struct btrfs_workqueue *readahead_workers; @@ -1000,21 +735,18 @@ struct btrfs_fs_info { struct btrfs_workqueue *fixup_workers; struct btrfs_workqueue *delayed_workers; - /* the extent workers do delayed refs on the extent allocation tree */ - struct btrfs_workqueue *extent_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; u32 thread_pool_size; struct kobject *space_info_kobj; - struct list_head pending_raid_kobjs; - spinlock_t pending_raid_kobjs_lock; /* uncontended */ u64 total_pinned; /* used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; + struct percpu_counter dio_bytes; s32 dirty_metadata_batch; s32 delalloc_batch; @@ -1092,10 +824,7 @@ struct btrfs_fs_info { /* holds configuration and tracking. Protected by qgroup_lock */ struct rb_root qgroup_tree; - struct rb_root qgroup_op_tree; spinlock_t qgroup_lock; - spinlock_t qgroup_op_lock; - atomic_t qgroup_op_seq; /* * used to avoid frequently calling ulist_alloc()/ulist_free() @@ -1152,12 +881,6 @@ struct btrfs_fs_info { struct mutex unused_bg_unpin_mutex; struct mutex delete_unused_bgs_mutex; - /* - * Chunks that can't be freed yet (under a trim/discard operation) - * and will be latter freed. Protected by fs_info->chunk_mutex. - */ - struct list_head pinned_chunks; - /* Cached block sizes */ u32 nodesize; u32 sectorsize; @@ -1167,6 +890,14 @@ struct btrfs_fs_info { spinlock_t swapfile_pins_lock; struct rb_root swapfile_pins; + struct crypto_shash *csum_shash; + + /* + * Number of send operations in progress. + * Updated while holding fs_info::balance_mutex. + */ + int send_in_progress; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -1348,6 +1079,12 @@ struct btrfs_root { * manipulation with the read-only status via SUBVOL_SETFLAGS */ int send_in_progress; + /* + * Number of currently running deduplication operations that have a + * destination inode belonging to this root. Protected by the lock + * root_item_lock. + */ + int dedupe_in_progress; struct btrfs_subvolume_writers *subv_writers; atomic_t will_be_snapshotted; atomic_t snapshot_force_cow; @@ -1368,6 +1105,16 @@ struct btrfs_root { #endif }; +struct btrfs_clone_extent_info { + u64 disk_offset; + u64 disk_len; + u64 data_offset; + u64 data_len; + u64 file_offset; + char *extent_buf; + u32 item_size; +}; + struct btrfs_file_private { void *filldir_buf; }; @@ -1466,19 +1213,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) btrfs_clear_opt(fs_info->mount_opt, opt); \ } -#ifdef CONFIG_BTRFS_DEBUG -static inline int -btrfs_should_fragment_free_space(struct btrfs_block_group_cache *block_group) -{ - struct btrfs_fs_info *fs_info = block_group->fs_info; - - return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && - block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || - (btrfs_test_opt(fs_info, FRAGMENT_DATA) && - block_group->flags & BTRFS_BLOCK_GROUP_DATA); -} -#endif - /* * Requests for changes that need to be done during transaction commit. * @@ -1540,6 +1274,21 @@ do { \ #define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) +#define BTRFS_INODE_FLAG_MASK \ + (BTRFS_INODE_NODATASUM | \ + BTRFS_INODE_NODATACOW | \ + BTRFS_INODE_READONLY | \ + BTRFS_INODE_NOCOMPRESS | \ + BTRFS_INODE_PREALLOC | \ + BTRFS_INODE_SYNC | \ + BTRFS_INODE_IMMUTABLE | \ + BTRFS_INODE_APPEND | \ + BTRFS_INODE_NODUMP | \ + BTRFS_INODE_NOATIME | \ + BTRFS_INODE_DIRSYNC | \ + BTRFS_INODE_COMPRESS | \ + BTRFS_INODE_ROOT_ITEM_INIT) + struct btrfs_map_token { const struct extent_buffer *eb; char *kaddr; @@ -1549,8 +1298,10 @@ struct btrfs_map_token { #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sb->s_blocksize_bits) -static inline void btrfs_init_map_token (struct btrfs_map_token *token) +static inline void btrfs_init_map_token(struct btrfs_map_token *token, + struct extent_buffer *eb) { + token->eb = eb; token->kaddr = NULL; } @@ -1581,17 +1332,10 @@ u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \ void btrfs_set_token_##bits(struct extent_buffer *eb, const void *ptr, \ unsigned long off, u##bits val, \ struct btrfs_map_token *token); \ -static inline u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ - const void *ptr, \ - unsigned long off) \ -{ \ - return btrfs_get_token_##bits(eb, ptr, off, NULL); \ -} \ -static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr,\ - unsigned long off, u##bits val) \ -{ \ - btrfs_set_token_##bits(eb, ptr, off, val, NULL); \ -} +u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off); \ +void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val); DECLARE_BTRFS_SETGET_BITS(8) DECLARE_BTRFS_SETGET_BITS(16) @@ -1776,18 +1520,18 @@ static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, } /* struct btrfs_block_group_item */ -BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, +BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, used, 64); -BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item, +BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64); -BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid, +BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid, struct btrfs_block_group_item, chunk_objectid, 64); -BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid, +BTRFS_SETGET_FUNCS(block_group_chunk_objectid, struct btrfs_block_group_item, chunk_objectid, 64); -BTRFS_SETGET_FUNCS(disk_block_group_flags, +BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); -BTRFS_SETGET_STACK_FUNCS(block_group_flags, +BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, struct btrfs_block_group_item, flags, 64); /* struct btrfs_free_space_info */ @@ -2133,16 +1877,6 @@ static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, btrfs_disk_key_to_cpu(key, &disk_key); } -static inline u8 btrfs_key_type(const struct btrfs_key *key) -{ - return key->type; -} - -static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val) -{ - key->type = val; -} - /* struct btrfs_header */ BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, @@ -2163,18 +1897,16 @@ static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) return (btrfs_header_flags(eb) & flag) == flag; } -static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) +static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) { u64 flags = btrfs_header_flags(eb); btrfs_set_header_flags(eb, flags | flag); - return (flags & flag) == flag; } -static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) +static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) { u64 flags = btrfs_header_flags(eb); btrfs_set_header_flags(eb, flags & ~flag); - return (flags & flag) == flag; } static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) @@ -2430,14 +2162,10 @@ BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); -static inline int btrfs_super_csum_size(const struct btrfs_super_block *s) -{ - u16 t = btrfs_super_csum_type(s); - /* - * csum type is validated at mount time - */ - return btrfs_csum_sizes[t]; -} +int btrfs_super_csum_size(const struct btrfs_super_block *s); +const char *btrfs_super_csum_name(u16 csum_type); +const char *btrfs_super_csum_driver(u16 csum_type); +size_t __const btrfs_get_num_csums(void); /* @@ -2445,13 +2173,12 @@ static inline int btrfs_super_csum_size(const struct btrfs_super_block *s) * this returns the address of the start of the last item, * which is the stop of the leaf data stack */ -static inline unsigned int leaf_data_end(const struct btrfs_fs_info *fs_info, - const struct extent_buffer *leaf) +static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) { u32 nr = btrfs_header_nritems(leaf); if (nr == 0) - return BTRFS_LEAF_DATA_SIZE(fs_info); + return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); return btrfs_item_offset_nr(leaf, nr - 1); } @@ -2512,30 +2239,6 @@ static inline u32 btrfs_file_extent_inline_item_len( return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START; } -/* btrfs_dev_stats_item */ -static inline u64 btrfs_dev_stats_value(const struct extent_buffer *eb, - const struct btrfs_dev_stats_item *ptr, - int index) -{ - u64 val; - - read_extent_buffer(eb, &val, - offsetof(struct btrfs_dev_stats_item, values) + - ((unsigned long)ptr) + (index * sizeof(u64)), - sizeof(val)); - return val; -} - -static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb, - struct btrfs_dev_stats_item *ptr, - int index, u64 val) -{ - write_extent_buffer(eb, &val, - offsetof(struct btrfs_dev_stats_item, values) + - ((unsigned long)ptr) + (index * sizeof(u64)), - sizeof(val)); -} - /* btrfs_qgroup_status_item */ BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, generation, 64); @@ -2631,6 +2334,16 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) +static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) +{ + return crc32c(crc, address, length); +} + +static inline void btrfs_crc32c_final(u32 crc, u8 *result) +{ + put_unaligned_le32(~crc, result); +} + static inline u64 btrfs_name_hash(const char *name, int len) { return crc32c((u32)~1, name, len); @@ -2645,12 +2358,6 @@ static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, return (u64) crc32c(parent_objectid, name, len); } -static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) -{ - return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && - (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); -} - static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) { return mapping_gfp_constraint(mapping, ~__GFP_FS); @@ -2668,38 +2375,35 @@ enum btrfs_inline_ref_type { int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data); +u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes); -static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info, - unsigned num_items) +/* + * Use this if we would be adding new items, as we could split nodes as we cow + * down the tree. + */ +static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, + unsigned num_items) { return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; } /* - * Doing a truncate won't result in new nodes or leaves, just what we need for - * COW. + * Doing a truncate or a modification won't result in new nodes or leaves, just + * what we need for COW. */ -static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, +static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, unsigned num_items) { return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); -bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); -void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, - const u64 start); -void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); -bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); -void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); -void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg); -void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 num_bytes); +void btrfs_free_excluded_extents(struct btrfs_block_group *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); -int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, - unsigned long count, u64 transid, int wait); void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); @@ -2711,15 +2415,9 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num, int reserved); int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes); -int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb); +int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr); -struct btrfs_block_group_cache *btrfs_lookup_block_group( - struct btrfs_fs_info *info, - u64 bytenr); -void btrfs_get_block_group(struct btrfs_block_group_cache *cache); -void btrfs_put_block_group(struct btrfs_block_group_cache *cache); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, @@ -2745,13 +2443,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 flags, int level, int is_data); -int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset); +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc); @@ -2760,35 +2454,11 @@ int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset); - -int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans); -int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); + struct btrfs_ref *generic_ref); + int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr); -int btrfs_free_block_groups(struct btrfs_fs_info *info); -int btrfs_read_block_groups(struct btrfs_fs_info *info); -int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr); -int btrfs_make_block_group(struct btrfs_trans_handle *trans, - u64 bytes_used, u64 type, u64 chunk_offset, - u64 size); -void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info); -struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( - struct btrfs_fs_info *fs_info, - const u64 chunk_offset); -int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - u64 group_start, struct extent_map *em); -void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); -void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache); -void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache); -void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); -u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info); -u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info); -u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info); +void btrfs_get_block_group_trimming(struct btrfs_block_group *cache); +void btrfs_put_block_group_trimming(struct btrfs_block_group *cache); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); enum btrfs_reserve_flush_enum { @@ -2799,6 +2469,7 @@ enum btrfs_reserve_flush_enum { * case, use FLUSH LIMIT */ BTRFS_RESERVE_FLUSH_LIMIT, + BTRFS_RESERVE_FLUSH_EVICT, BTRFS_RESERVE_FLUSH_ALL, }; @@ -2811,73 +2482,23 @@ enum btrfs_flush_state { FLUSH_DELALLOC_WAIT = 6, ALLOC_CHUNK = 7, ALLOC_CHUNK_FORCE = 8, - COMMIT_TRANS = 9, + RUN_DELAYED_IPUTS = 9, + COMMIT_TRANS = 10, }; -int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); -int btrfs_check_data_free_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len); -void btrfs_free_reserved_data_space(struct inode *inode, - struct extent_changeset *reserved, u64 start, u64 len); -void btrfs_delalloc_release_space(struct inode *inode, - struct extent_changeset *reserved, - u64 start, u64 len, bool qgroup_free); -void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, - u64 len); -void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free); +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); -void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free); -int btrfs_delalloc_reserve_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len); -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, - unsigned short type); -void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv, - unsigned short type); -void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv); -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, u64 num_bytes, - enum btrfs_reserve_flush_enum flush); -int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); -int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, u64 min_reserved, - enum btrfs_reserve_flush_enum flush); -int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, - struct btrfs_block_rsv *dst_rsv, u64 num_bytes, - bool update_size); -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor); -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes); -void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); -void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); -int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, - enum btrfs_reserve_flush_enum flush); -void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, - u64 num_bytes); -int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); -void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); -void btrfs_put_block_group_cache(struct btrfs_fs_info *info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); -int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); @@ -2886,15 +2507,11 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, int btrfs_start_write_no_snapshotting(struct btrfs_root *root); void btrfs_end_write_no_snapshotting(struct btrfs_root *root); void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); -void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type); -u64 add_new_free_space(struct btrfs_block_group_cache *block_group, - u64 start, u64 end); -void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int level, int *slot); -int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); +int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); @@ -2912,20 +2529,9 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_path *path, u64 min_trans); -enum btrfs_compare_tree_result { - BTRFS_COMPARE_TREE_NEW, - BTRFS_COMPARE_TREE_DELETED, - BTRFS_COMPARE_TREE_CHANGED, - BTRFS_COMPARE_TREE_SAME, -}; -typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, - struct btrfs_path *right_path, - struct btrfs_key *key, - enum btrfs_compare_tree_result result, - void *ctx); -int btrfs_compare_trees(struct btrfs_root *left_root, - struct btrfs_root *right_root, - btrfs_changed_cb_t cb, void *ctx); +struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, + int slot); + int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, @@ -2936,10 +2542,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid); int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf); -void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - u32 data_size); -void btrfs_truncate_item(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u32 new_size, int from_end); +void btrfs_extend_item(struct btrfs_path *path, u32 data_size); +void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -2967,8 +2571,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); -void btrfs_set_path_blocking(struct btrfs_path *p); -void btrfs_unlock_up_safe(struct btrfs_path *p, int level); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); @@ -3015,8 +2617,7 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { return btrfs_next_old_item(root, p, 0); } -int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf); +int btrfs_leaf_free_space(struct extent_buffer *leaf); int __must_check btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int update_ref, int for_reloc); @@ -3177,19 +2778,18 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); -int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, - const char *name, - int name_len, struct btrfs_inode_ref **ref_ret); -int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, - u64 ref_objectid, const char *name, - int name_len, - struct btrfs_inode_extref **extref_ret); - +struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, + int slot, const char *name, + int name_len); +struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( + struct extent_buffer *leaf, int slot, u64 ref_objectid, + const char *name, int name_len); /* file-item.c */ struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); + struct btrfs_root *root, u64 bytenr, u64 len); +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, + u8 *dst); blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, @@ -3245,7 +2845,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, - struct extent_state **cached_state, int dedupe); + struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, struct btrfs_root *parent_root, @@ -3267,14 +2867,14 @@ void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); +void btrfs_free_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *new, - struct btrfs_path *path); + struct btrfs_root *root, struct btrfs_path *path); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *was_new); + struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 end, int create); @@ -3310,7 +2910,7 @@ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); -int btrfs_is_empty_uuid(u8 *uuid); +int __pure btrfs_is_empty_uuid(u8 *uuid); int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_pages); @@ -3340,6 +2940,10 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, int drop_cache); +int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, + const u64 start, const u64 end, + struct btrfs_clone_extent_info *clone_info, + struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); @@ -3355,12 +2959,6 @@ loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); -/* sysfs.c */ -int __init btrfs_init_sysfs(void); -void __cold btrfs_exit_sysfs(void); -int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); -void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); - /* super.c */ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); @@ -3516,8 +3114,7 @@ __cold static inline void assfail(const char *expr, const char *file, int line) { if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { - pr_err("assertion failed: %s, file: %s, line: %d\n", - expr, file, line); + pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); BUG(); } } @@ -3547,7 +3144,7 @@ __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); -const char *btrfs_decode_error(int errno); +const char * __attribute_const__ btrfs_decode_error(int errno); __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, @@ -3601,10 +3198,11 @@ do { \ /* compatibility and incompatibility defines */ #define btrfs_set_fs_incompat(__fs_info, opt) \ - __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ + #opt) static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag) + u64 flag, const char* name) { struct btrfs_super_block *disk_super; u64 features; @@ -3617,18 +3215,20 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, if (!(features & flag)) { features |= flag; btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, "setting %llu feature flag", - flag); + btrfs_info(fs_info, + "setting incompat feature flag for %s (0x%llx)", + name, flag); } spin_unlock(&fs_info->super_lock); } } #define btrfs_clear_fs_incompat(__fs_info, opt) \ - __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ + #opt) static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag) + u64 flag, const char* name) { struct btrfs_super_block *disk_super; u64 features; @@ -3641,8 +3241,9 @@ static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, if (features & flag) { features &= ~flag; btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, "clearing %llu feature flag", - flag); + btrfs_info(fs_info, + "clearing incompat feature flag for %s (0x%llx)", + name, flag); } spin_unlock(&fs_info->super_lock); } @@ -3659,10 +3260,11 @@ static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) } #define btrfs_set_fs_compat_ro(__fs_info, opt) \ - __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ + #opt) static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag) + u64 flag, const char *name) { struct btrfs_super_block *disk_super; u64 features; @@ -3675,18 +3277,20 @@ static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, if (!(features & flag)) { features |= flag; btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, "setting %llu ro feature flag", - flag); + btrfs_info(fs_info, + "setting compat-ro feature flag for %s (0x%llx)", + name, flag); } spin_unlock(&fs_info->super_lock); } } #define btrfs_clear_fs_compat_ro(__fs_info, opt) \ - __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ + #opt) static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag) + u64 flag, const char *name) { struct btrfs_super_block *disk_super; u64 features; @@ -3699,8 +3303,9 @@ static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, if (features & flag) { features &= ~flag; btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, "clearing %llu ro feature flag", - flag); + btrfs_info(fs_info, + "clearing compat-ro feature flag for %s (0x%llx)", + name, flag); } spin_unlock(&fs_info->super_lock); } @@ -3755,8 +3360,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); int btrfs_scrub_cancel(struct btrfs_fs_info *info); -int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, - struct btrfs_device *dev); +int btrfs_scrub_cancel_dev(struct btrfs_device *dev); int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_scrub_progress *progress); static inline void btrfs_init_full_stripe_locks_tree( @@ -3805,6 +3409,8 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) return signal_pending(current); } +#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) + /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_inode_set_ops(struct inode *inode); @@ -3821,26 +3427,4 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) } #endif -static inline void cond_wake_up(struct wait_queue_head *wq) -{ - /* - * This implies a full smp_mb barrier, see comments for - * waitqueue_active why. - */ - if (wq_has_sleeper(wq)) - wake_up(wq); -} - -static inline void cond_wake_up_nomb(struct wait_queue_head *wq) -{ - /* - * Special case for conditional wakeup where the barrier required for - * waitqueue_active is implied by some of the preceding code. Eg. one - * of such atomic operations (atomic_dec_and_return, ...), or a - * unlock/lock sequence, etc. - */ - if (waitqueue_active(wq)) - wake_up(wq); -} - #endif diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h deleted file mode 100644 index 90281a7a35a8..000000000000 --- a/fs/btrfs/dedupe.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2016 Fujitsu. All rights reserved. - */ - -#ifndef BTRFS_DEDUPE_H -#define BTRFS_DEDUPE_H - -/* later in-band dedupe will expand this struct */ -struct btrfs_dedupe_hash; - -#endif diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c new file mode 100644 index 000000000000..4cdac4d834f5 --- /dev/null +++ b/fs/btrfs/delalloc-space.c @@ -0,0 +1,490 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "ctree.h" +#include "delalloc-space.h" +#include "block-rsv.h" +#include "btrfs_inode.h" +#include "space-info.h" +#include "transaction.h" +#include "qgroup.h" +#include "block-group.h" + +int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + u64 used; + int ret = 0; + int need_commit = 2; + int have_pinned_space; + + /* Make sure bytes are sectorsize aligned */ + bytes = ALIGN(bytes, fs_info->sectorsize); + + if (btrfs_is_free_space_inode(inode)) { + need_commit = 0; + ASSERT(current->journal_info); + } + +again: + /* Make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + used = btrfs_space_info_used(data_sinfo, true); + + if (used + bytes > data_sinfo->total_bytes) { + struct btrfs_trans_handle *trans; + + /* + * If we don't have enough free bytes in this space then we need + * to alloc a new chunk. + */ + if (!data_sinfo->full) { + u64 alloc_target; + + data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; + spin_unlock(&data_sinfo->lock); + + alloc_target = btrfs_data_alloc_profile(fs_info); + /* + * It is ugly that we don't call nolock join + * transaction for the free space inode case here. + * But it is safe because we only do the data space + * reservation for the free space cache in the + * transaction context, the common join transaction + * just increase the counter of the current transaction + * handler, doesn't try to acquire the trans_lock of + * the fs. + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_chunk_alloc(trans, alloc_target, + CHUNK_ALLOC_NO_FORCE); + btrfs_end_transaction(trans); + if (ret < 0) { + if (ret != -ENOSPC) + return ret; + else { + have_pinned_space = 1; + goto commit_trans; + } + } + + goto again; + } + + /* + * If we don't have enough pinned space to deal with this + * allocation, and no removed chunk in current transaction, + * don't bother committing the transaction. + */ + have_pinned_space = __percpu_counter_compare( + &data_sinfo->total_bytes_pinned, + used + bytes - data_sinfo->total_bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH); + spin_unlock(&data_sinfo->lock); + + /* Commit the current transaction and try again */ +commit_trans: + if (need_commit) { + need_commit--; + + if (need_commit > 0) { + btrfs_start_delalloc_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, + (u64)-1); + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + if (have_pinned_space >= 0 || + test_bit(BTRFS_TRANS_HAVE_FREE_BGS, + &trans->transaction->flags) || + need_commit > 0) { + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + /* + * The cleaner kthread might still be doing iput + * operations. Wait for it to finish so that + * more space is released. We don't need to + * explicitly run the delayed iputs here because + * the commit_transaction would have woken up + * the cleaner. + */ + ret = btrfs_wait_on_delayed_iputs(fs_info); + if (ret) + return ret; + goto again; + } else { + btrfs_end_transaction(trans); + } + } + + trace_btrfs_space_reservation(fs_info, + "space_info:enospc", + data_sinfo->flags, bytes, 1); + return -ENOSPC; + } + btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); + spin_unlock(&data_sinfo->lock); + + return 0; +} + +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + int ret; + + /* align the range */ + len = round_up(start + len, fs_info->sectorsize) - + round_down(start, fs_info->sectorsize); + start = round_down(start, fs_info->sectorsize); + + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); + if (ret < 0) + return ret; + + /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ + ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); + if (ret < 0) + btrfs_free_reserved_data_space_noquota(inode, start, len); + else + ret = 0; + return ret; +} + +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will *NOT* use accurate qgroup reserved space API, just for case + * which we can't sleep and is sure it won't affect qgroup reserved space. + * Like clear_bit_hook(). + */ +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_space_info *data_sinfo; + + /* Make sure the range is aligned to sectorsize */ + len = round_up(start + len, fs_info->sectorsize) - + round_down(start, fs_info->sectorsize); + start = round_down(start, fs_info->sectorsize); + + data_sinfo = fs_info->data_sinfo; + spin_lock(&data_sinfo->lock); + btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len); + spin_unlock(&data_sinfo->lock); +} + +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will handle the per-inode data rsv map for accurate reserved + * space framework. + */ +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* Make sure the range is aligned to sectorsize */ + len = round_up(start + len, root->fs_info->sectorsize) - + round_down(start, root->fs_info->sectorsize); + start = round_down(start, root->fs_info->sectorsize); + + btrfs_free_reserved_data_space_noquota(inode, start, len); + btrfs_qgroup_free_data(inode, reserved, start, len); +} + +/** + * btrfs_inode_rsv_release - release any excessive reservation. + * @inode - the inode we need to release from. + * @qgroup_free - free or convert qgroup meta. + * Unlike normal operation, qgroup meta reservation needs to know if we are + * freeing qgroup reservation or just converting it into per-trans. Normally + * @qgroup_free is true for error handling, and false for normal release. + * + * This is the same as btrfs_block_rsv_release, except that it handles the + * tracepoint for the reservation. + */ +static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 released = 0; + u64 qgroup_to_release = 0; + + /* + * Since we statically set the block_rsv->size we just want to say we + * are releasing 0 bytes, and then we'll just get the reservation over + * the size free'd. + */ + released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, + &qgroup_to_release); + if (released > 0) + trace_btrfs_space_reservation(fs_info, "delalloc", + btrfs_ino(inode), released, 0); + if (qgroup_free) + btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); + else + btrfs_qgroup_convert_reserved_meta(inode->root, + qgroup_to_release); +} + +static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) +{ + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 reserve_size = 0; + u64 qgroup_rsv_size = 0; + u64 csum_leaves; + unsigned outstanding_extents; + + lockdep_assert_held(&inode->lock); + outstanding_extents = inode->outstanding_extents; + + /* + * Insert size for the number of outstanding extents, 1 normal size for + * updating the inode. + */ + if (outstanding_extents) { + reserve_size = btrfs_calc_insert_metadata_size(fs_info, + outstanding_extents); + reserve_size += btrfs_calc_metadata_size(fs_info, 1); + } + csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, + inode->csum_bytes); + reserve_size += btrfs_calc_insert_metadata_size(fs_info, + csum_leaves); + /* + * For qgroup rsv, the calculation is very simple: + * account one nodesize for each outstanding extent + * + * This is overestimating in most cases. + */ + qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; + + spin_lock(&block_rsv->lock); + block_rsv->size = reserve_size; + block_rsv->qgroup_rsv_size = qgroup_rsv_size; + spin_unlock(&block_rsv->lock); +} + +static void calc_inode_reservations(struct btrfs_fs_info *fs_info, + u64 num_bytes, u64 *meta_reserve, + u64 *qgroup_reserve) +{ + u64 nr_extents = count_max_extents(num_bytes); + u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); + u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); + + *meta_reserve = btrfs_calc_insert_metadata_size(fs_info, + nr_extents + csum_leaves); + + /* + * finish_ordered_io has to update the inode, so add the space required + * for an inode update. + */ + *meta_reserve += inode_update; + *qgroup_reserve = nr_extents * fs_info->nodesize; +} + +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 meta_reserve, qgroup_reserve; + unsigned nr_extents; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; + int ret = 0; + + /* + * If we are a free space inode we need to not flush since we will be in + * the middle of a transaction commit. We also don't need the delalloc + * mutex since we won't race with anybody. We need this mostly to make + * lockdep shut its filthy mouth. + * + * If we have a transaction open (can happen if we call truncate_block + * from truncate), then we need FLUSH_LIMIT so we don't deadlock. + */ + if (btrfs_is_free_space_inode(inode)) { + flush = BTRFS_RESERVE_NO_FLUSH; + } else { + if (current->journal_info) + flush = BTRFS_RESERVE_FLUSH_LIMIT; + + if (btrfs_transaction_in_commit(fs_info)) + schedule_timeout(1); + } + + num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + + /* + * We always want to do it this way, every other way is wrong and ends + * in tears. Pre-reserving the amount we are going to add will always + * be the right way, because otherwise if we have enough parallelism we + * could end up with thousands of inodes all holding little bits of + * reservations they were able to make previously and the only way to + * reclaim that space is to ENOSPC out the operations and clear + * everything out and try again, which is bad. This way we just + * over-reserve slightly, and clean up the mess when we are done. + */ + calc_inode_reservations(fs_info, num_bytes, &meta_reserve, + &qgroup_reserve); + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); + if (ret) + return ret; + ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); + if (ret) { + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); + return ret; + } + + /* + * Now we need to update our outstanding extents and csum bytes _first_ + * and then add the reservation to the block_rsv. This keeps us from + * racing with an ordered completion or some such that would think it + * needs to free the reservation we just made. + */ + spin_lock(&inode->lock); + nr_extents = count_max_extents(num_bytes); + btrfs_mod_outstanding_extents(inode, nr_extents); + inode->csum_bytes += num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + /* Now we can safely add our space to our block rsv */ + btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), meta_reserve, 1); + + spin_lock(&block_rsv->lock); + block_rsv->qgroup_rsv_reserved += qgroup_reserve; + spin_unlock(&block_rsv->lock); + + return 0; +} + +/** + * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * @inode: the inode to release the reservation for. + * @num_bytes: the number of bytes we are releasing. + * @qgroup_free: free qgroup reservation or convert it to per-trans reservation + * + * This will release the metadata reservation for an inode. This can be called + * once we complete IO for a given set of bytes to release their metadata + * reservations, or on error for the same reason. + */ +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + spin_lock(&inode->lock); + inode->csum_bytes -= num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + if (btrfs_is_testing(fs_info)) + return; + + btrfs_inode_rsv_release(inode, qgroup_free); +} + +/** + * btrfs_delalloc_release_extents - release our outstanding_extents + * @inode: the inode to balance the reservation for. + * @num_bytes: the number of bytes we originally reserved with + * + * When we reserve space we increase outstanding_extents for the extents we may + * add. Once we've set the range as delalloc or created our ordered extents we + * have outstanding_extents to track the real usage, so we use this to free our + * temporarily tracked outstanding_extents. This _must_ be used in conjunction + * with btrfs_delalloc_reserve_metadata. + */ +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + unsigned num_extents; + + spin_lock(&inode->lock); + num_extents = count_max_extents(num_bytes); + btrfs_mod_outstanding_extents(inode, -num_extents); + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + if (btrfs_is_testing(fs_info)) + return; + + btrfs_inode_rsv_release(inode, true); +} + +/** + * btrfs_delalloc_reserve_space - reserve data and metadata space for + * delalloc + * @inode: inode we're writing to + * @start: start range we are writing to + * @len: how long the range we are writing to + * @reserved: mandatory parameter, record actually reserved qgroup ranges of + * current reservation. + * + * This will do the following things + * + * - reserve space in data space info for num bytes + * and reserve precious corresponding qgroup space + * (Done in check_data_free_space) + * + * - reserve space for metadata space, based on the number of outstanding + * extents and how much csums will be needed + * also reserve metadata space in a per root over-reserve method. + * - add to the inodes->delalloc_bytes + * - add it to the fs_info's delalloc inodes list. + * (Above 3 all done in delalloc_reserve_metadata) + * + * Return 0 for success + * Return <0 for error(-ENOSPC or -EQUOT) + */ +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) +{ + int ret; + + ret = btrfs_check_data_free_space(inode, reserved, start, len); + if (ret < 0) + return ret; + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); + if (ret < 0) + btrfs_free_reserved_data_space(inode, *reserved, start, len); + return ret; +} + +/** + * btrfs_delalloc_release_space - release data and metadata space for delalloc + * @inode: inode we're releasing space for + * @start: start position of the space already reserved + * @len: the len of the space already reserved + * @release_bytes: the len of the space we consumed or didn't use + * + * This function will release the metadata space that was not used and will + * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes + * list if there are no delalloc bytes left. + * Also it will handle the qgroup reserved space. + */ +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, + u64 start, u64 len, bool qgroup_free) +{ + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); + btrfs_free_reserved_data_space(inode, reserved, start, len); +} diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h new file mode 100644 index 000000000000..54466fbd7075 --- /dev/null +++ b/fs/btrfs/delalloc-space.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DELALLOC_SPACE_H +#define BTRFS_DELALLOC_SPACE_H + +struct extent_changeset; + +int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, + u64 start, u64 len, bool qgroup_free); +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len); +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free); +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); + +#endif /* BTRFS_DELALLOC_SPACE_H */ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c669f250d4a0..d3e15e1d4a91 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -6,11 +6,13 @@ #include <linux/slab.h> #include <linux/iversion.h> +#include "misc.h" #include "delayed-inode.h" #include "disk-io.h" #include "transaction.h" #include "ctree.h" #include "qgroup.h" +#include "locking.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 @@ -474,6 +476,9 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) struct rb_root_cached *root; struct btrfs_delayed_root *delayed_root; + /* Not associated with any delayed_node */ + if (!delayed_item->delayed_node) + return; delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; BUG_ON(!delayed_root); @@ -555,7 +560,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, src_rsv = trans->block_rsv; dst_rsv = &fs_info->delayed_block_rsv; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); /* * Here we migrate space rsv from transaction rsv, since have already @@ -609,7 +614,7 @@ static int btrfs_delayed_inode_reserve_metadata( src_rsv = trans->block_rsv; dst_rsv = &fs_info->delayed_block_rsv; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + num_bytes = btrfs_calc_metadata_size(fs_info, 1); /* * btrfs_dirty_inode will update the inode under btrfs_join_transaction @@ -691,7 +696,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_delayed_item *item) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_delayed_item *curr, *next; int free_space; int total_data_size = 0, total_size = 0; @@ -708,7 +712,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, BUG_ON(!path->nodes[0]); leaf = path->nodes[0]; - free_space = btrfs_leaf_free_space(fs_info, leaf); + free_space = btrfs_leaf_free_space(leaf); INIT_LIST_HEAD(&head); next = item; @@ -1364,8 +1368,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, return -ENOMEM; async_work->delayed_root = delayed_root; - btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper, - btrfs_async_run_delayed_root, NULL, NULL); + btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL, + NULL); async_work->nr = nr; btrfs_queue_work(fs_info->delayed_workers, &async_work->work); @@ -1526,7 +1530,12 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, * we have reserved enough space when we start a new transaction, * so reserving metadata failure is impossible. */ - BUG_ON(ret); + if (ret < 0) { + btrfs_err(trans->fs_info, +"metadata reservation failed for delayed dir item deltiona, should have been reserved"); + btrfs_release_delayed_item(item); + goto end; + } mutex_lock(&node->mutex); ret = __btrfs_add_delayed_deletion_item(node, item); @@ -1535,7 +1544,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", index, node->root->root_key.objectid, node->inode_id, ret); - BUG(); + btrfs_delayed_item_release_metadata(dir->root, item); + btrfs_release_delayed_item(item); } mutex_unlock(&node->mutex); end: @@ -1692,7 +1702,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, name = (char *)(di + 1); name_len = btrfs_stack_dir_name_len(di); - d_type = btrfs_filetype_table[di->type]; + d_type = fs_ftype_to_dtype(di->type); btrfs_disk_key_to_cpu(&location, &di->location); over = !dir_emit(ctx, name, name_len, @@ -1940,12 +1950,19 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) } inode_id = delayed_nodes[n - 1]->inode_id + 1; - - for (i = 0; i < n; i++) - refcount_inc(&delayed_nodes[i]->refs); + for (i = 0; i < n; i++) { + /* + * Don't increase refs in case the node is dead and + * about to be removed from the tree in the loop below + */ + if (!refcount_inc_not_zero(&delayed_nodes[i]->refs)) + delayed_nodes[i] = NULL; + } spin_unlock(&root->inode_lock); for (i = 0; i < n; i++) { + if (!delayed_nodes[i]) + continue; __btrfs_kill_delayed_node(delayed_nodes[i]); btrfs_release_delayed_node(delayed_nodes[i]); } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 7d2a413df90d..df3bd880061d 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -10,6 +10,7 @@ #include "delayed-ref.h" #include "transaction.h" #include "qgroup.h" +#include "space-info.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; @@ -24,6 +25,179 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep; * of hammering updates on the extent allocation tree. */ +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + bool ret = false; + u64 reserved; + + spin_lock(&global_rsv->lock); + reserved = global_rsv->reserved; + spin_unlock(&global_rsv->lock); + + /* + * Since the global reserve is just kind of magic we don't really want + * to rely on it to save our bacon, so if our size is more than the + * delayed_refs_rsv and the global rsv then it's time to think about + * bailing. + */ + spin_lock(&delayed_refs_rsv->lock); + reserved += delayed_refs_rsv->reserved; + if (delayed_refs_rsv->size >= reserved) + ret = true; + spin_unlock(&delayed_refs_rsv->lock); + return ret; +} + +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) +{ + u64 num_entries = + atomic_read(&trans->transaction->delayed_refs.num_entries); + u64 avg_runtime; + u64 val; + + smp_mb(); + avg_runtime = trans->fs_info->avg_delayed_ref_runtime; + val = num_entries * avg_runtime; + if (val >= NSEC_PER_SEC) + return 1; + if (val >= NSEC_PER_SEC / 2) + return 2; + + return btrfs_check_space_for_delayed_refs(trans->fs_info); +} + +/** + * btrfs_delayed_refs_rsv_release - release a ref head's reservation. + * @fs_info - the fs_info for our fs. + * @nr - the number of items to drop. + * + * This drops the delayed ref head's count from the delayed refs rsv and frees + * any excess reservation we had. + */ +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr); + u64 released = 0; + + released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, + NULL); + if (released) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, released, 0); +} + +/* + * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv + * @trans - the trans that may have generated delayed refs + * + * This is to be called anytime we may have adjusted trans->delayed_ref_updates, + * it'll calculate the additional size and add it to the delayed_refs_rsv. + */ +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + u64 num_bytes; + + if (!trans->delayed_ref_updates) + return; + + num_bytes = btrfs_calc_insert_metadata_size(fs_info, + trans->delayed_ref_updates); + spin_lock(&delayed_rsv->lock); + delayed_rsv->size += num_bytes; + delayed_rsv->full = 0; + spin_unlock(&delayed_rsv->lock); + trans->delayed_ref_updates = 0; +} + +/** + * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. + * @fs_info - the fs info for our fs. + * @src - the source block rsv to transfer from. + * @num_bytes - the number of bytes to transfer. + * + * This transfers up to the num_bytes amount from the src rsv to the + * delayed_refs_rsv. Any extra bytes are returned to the space info. + */ +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + u64 to_free = 0; + + spin_lock(&src->lock); + src->reserved -= num_bytes; + src->size -= num_bytes; + spin_unlock(&src->lock); + + spin_lock(&delayed_refs_rsv->lock); + if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { + u64 delta = delayed_refs_rsv->size - + delayed_refs_rsv->reserved; + if (num_bytes > delta) { + to_free = num_bytes - delta; + num_bytes = delta; + } + } else { + to_free = num_bytes; + num_bytes = 0; + } + + if (num_bytes) + delayed_refs_rsv->reserved += num_bytes; + if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) + delayed_refs_rsv->full = 1; + spin_unlock(&delayed_refs_rsv->lock); + + if (num_bytes) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + if (to_free) + btrfs_space_info_free_bytes_may_use(fs_info, + delayed_refs_rsv->space_info, to_free); +} + +/** + * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. + * @fs_info - the fs_info for our fs. + * @flush - control how we can flush for this reservation. + * + * This will refill the delayed block_rsv up to 1 items size worth of space and + * will return -ENOSPC if we can't make the reservation. + */ +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1); + u64 num_bytes = 0; + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + num_bytes = block_rsv->size - block_rsv->reserved; + num_bytes = min(num_bytes, limit); + } + spin_unlock(&block_rsv->lock); + + if (!num_bytes) + return 0; + + ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv, + num_bytes, flush); + if (ret) + return ret; + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0); + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + return 0; +} + /* * compare two delayed tree backrefs with same bytenr and type */ @@ -735,8 +909,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, * transaction commits. */ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action, + struct btrfs_ref *generic_ref, struct btrfs_delayed_extent_op *extent_op, int *old_ref_mod, int *new_ref_mod) { @@ -746,10 +919,18 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; int qrecord_inserted; - bool is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID); + bool is_system; + int action = generic_ref->action; + int level = generic_ref->tree_ref.level; int ret; + u64 bytenr = generic_ref->bytenr; + u64 num_bytes = generic_ref->len; + u64 parent = generic_ref->parent; u8 ref_type; + is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID); + + ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) @@ -762,7 +943,9 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, } if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - is_fstree(ref_root)) { + is_fstree(generic_ref->real_root) && + is_fstree(generic_ref->tree_ref.root) && + !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); @@ -777,13 +960,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, ref_type = BTRFS_TREE_BLOCK_REF_KEY; init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, - ref_root, action, ref_type); - ref->root = ref_root; + generic_ref->tree_ref.root, action, ref_type); + ref->root = generic_ref->tree_ref.root; ref->parent = parent; ref->level = level; init_delayed_ref_head(head_ref, record, bytenr, num_bytes, - ref_root, 0, action, false, is_system); + generic_ref->tree_ref.root, 0, action, false, + is_system); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -822,10 +1006,9 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. */ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, - u64 parent, u64 ref_root, - u64 owner, u64 offset, u64 reserved, int action, - int *old_ref_mod, int *new_ref_mod) + struct btrfs_ref *generic_ref, + u64 reserved, int *old_ref_mod, + int *new_ref_mod) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_data_ref *ref; @@ -833,9 +1016,17 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; int qrecord_inserted; + int action = generic_ref->action; int ret; + u64 bytenr = generic_ref->bytenr; + u64 num_bytes = generic_ref->len; + u64 parent = generic_ref->parent; + u64 ref_root = generic_ref->data_ref.ref_root; + u64 owner = generic_ref->data_ref.ino; + u64 offset = generic_ref->data_ref.offset; u8 ref_type; + ASSERT(generic_ref->type == BTRFS_REF_DATA && action); ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; @@ -859,7 +1050,9 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, } if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - is_fstree(ref_root)) { + is_fstree(ref_root) && + is_fstree(generic_ref->real_root) && + !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); @@ -905,8 +1098,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, return 0; } -int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op) { @@ -939,13 +1131,14 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, } /* - * this does a simple search for the head node for a given extent. - * It must be called with the delayed ref spinlock held, and it returns - * the head node if any where found, or NULL if not. + * This does a simple search for the head node for a given extent. Returns the + * head node if found, or NULL if not. */ struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) { + lockdep_assert_held(&delayed_refs->lock); + return find_ref_head(delayed_refs, bytenr, false); } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 70606da440aa..1c977e6d45dc 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -176,6 +176,83 @@ struct btrfs_delayed_ref_root { u64 qgroup_to_skip; }; +enum btrfs_ref_type { + BTRFS_REF_NOT_SET, + BTRFS_REF_DATA, + BTRFS_REF_METADATA, + BTRFS_REF_LAST, +}; + +struct btrfs_data_ref { + /* For EXTENT_DATA_REF */ + + /* Root which refers to this data extent */ + u64 ref_root; + + /* Inode which refers to this data extent */ + u64 ino; + + /* + * file_offset - extent_offset + * + * file_offset is the key.offset of the EXTENT_DATA key. + * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data. + */ + u64 offset; +}; + +struct btrfs_tree_ref { + /* + * Level of this tree block + * + * Shared for skinny (TREE_BLOCK_REF) and normal tree ref. + */ + int level; + + /* + * Root which refers to this tree block. + * + * For TREE_BLOCK_REF (skinny metadata, either inline or keyed) + */ + u64 root; + + /* For non-skinny metadata, no special member needed */ +}; + +struct btrfs_ref { + enum btrfs_ref_type type; + int action; + + /* + * Whether this extent should go through qgroup record. + * + * Normally false, but for certain cases like delayed subtree scan, + * setting this flag can hugely reduce qgroup overhead. + */ + bool skip_qgroup; + + /* + * Optional. For which root is this modification. + * Mostly used for qgroup optimization. + * + * When unset, data/tree ref init code will populate it. + * In certain cases, we're modifying reference for a different root. + * E.g. COW fs tree blocks for balance. + * In that case, tree_ref::root will be fs tree, but we're doing this + * for reloc tree, then we should set @real_root to reloc tree. + */ + u64 real_root; + u64 bytenr; + u64 len; + + /* Bytenr of the parent tree block */ + u64 parent; + union { + struct btrfs_data_ref data_ref; + struct btrfs_tree_ref tree_ref; + }; +}; + extern struct kmem_cache *btrfs_delayed_ref_head_cachep; extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; extern struct kmem_cache *btrfs_delayed_data_ref_cachep; @@ -184,6 +261,38 @@ extern struct kmem_cache *btrfs_delayed_extent_op_cachep; int __init btrfs_delayed_ref_init(void); void __cold btrfs_delayed_ref_exit(void); +static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, + int action, u64 bytenr, u64 len, u64 parent) +{ + generic_ref->action = action; + generic_ref->bytenr = bytenr; + generic_ref->len = len; + generic_ref->parent = parent; +} + +static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, + int level, u64 root) +{ + /* If @real_root not set, use @root as fallback */ + if (!generic_ref->real_root) + generic_ref->real_root = root; + generic_ref->tree_ref.level = level; + generic_ref->tree_ref.root = root; + generic_ref->type = BTRFS_REF_METADATA; +} + +static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref, + u64 ref_root, u64 ino, u64 offset) +{ + /* If @real_root not set, use @root as fallback */ + if (!generic_ref->real_root) + generic_ref->real_root = ref_root; + generic_ref->data_ref.ref_root = ref_root; + generic_ref->data_ref.ino = ino; + generic_ref->data_ref.offset = offset; + generic_ref->type = BTRFS_REF_DATA; +} + static inline struct btrfs_delayed_extent_op * btrfs_alloc_delayed_extent_op(void) { @@ -224,17 +333,14 @@ static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *hea } int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action, + struct btrfs_ref *generic_ref, struct btrfs_delayed_extent_op *extent_op, int *old_ref_mod, int *new_ref_mod); int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, - u64 parent, u64 ref_root, - u64 owner, u64 offset, u64 reserved, int action, - int *old_ref_mod, int *new_ref_mod); -int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, + struct btrfs_ref *generic_ref, + u64 reserved, int *old_ref_mod, + int *new_ref_mod); +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, @@ -258,6 +364,16 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush); +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes); +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); + /* * helper functions to cast a node into its container */ diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index ee193c5222b2..ba4d8f375b3c 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -9,6 +9,7 @@ #include <linux/blkdev.h> #include <linux/kthread.h> #include <linux/math64.h> +#include "misc.h" #include "ctree.h" #include "extent_map.h" #include "disk-io.h" @@ -56,7 +57,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) no_valid_dev_replace_entry_found: ret = 0; dev_replace->replace_state = - BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; + BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; dev_replace->cont_reading_from_srcdev_mode = BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; dev_replace->time_started = 0; @@ -201,7 +202,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return PTR_ERR(bdev); } - filemap_write_and_wait(bdev->bd_inode->i_mapping); + sync_blockdev(bdev); devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { @@ -237,7 +238,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } rcu_assign_pointer(device->name, name); - mutex_lock(&fs_info->fs_devices->device_list_mutex); set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = 0; device->io_width = fs_info->sectorsize; @@ -256,6 +256,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_info->fs_devices; + + mutex_lock(&fs_info->fs_devices->device_list_mutex); list_add(&device->dev_list, &fs_info->fs_devices->devices); fs_info->fs_devices->num_devices++; fs_info->fs_devices->open_devices++; @@ -273,9 +275,9 @@ error: * called from commit_transaction. Writes changed device replace state to * disk. */ -int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_root *dev_root = fs_info->dev_root; struct btrfs_path *path; @@ -399,7 +401,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, int ret; struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; - bool need_unlock; src_device = btrfs_find_device_by_devspec(fs_info, srcdevid, srcdev_name); @@ -413,11 +414,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return -ETXTBSY; } - ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, - src_device, &tgt_device); - if (ret) - return ret; - /* * Here we commit the transaction to make sure commit_total_bytes * of all the devices are updated. @@ -431,7 +427,11 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return PTR_ERR(trans); } - need_unlock = true; + ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, + src_device, &tgt_device); + if (ret) + return ret; + down_write(&dev_replace->rwsem); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: @@ -442,11 +442,11 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: ASSERT(0); ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; + up_write(&dev_replace->rwsem); goto leave; } dev_replace->cont_reading_from_srcdev_mode = read_src; - WARN_ON(!src_device); dev_replace->srcdev = src_device; dev_replace->tgtdev = tgt_device; @@ -471,7 +471,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, atomic64_set(&dev_replace->num_write_errors, 0); atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); up_write(&dev_replace->rwsem); - need_unlock = false; ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); if (ret) @@ -479,16 +478,16 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); - /* force writing the updated state information to disk */ - trans = btrfs_start_transaction(root, 0); + /* Commit dev_replace state and reserve 1 item for it. */ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - need_unlock = true; down_write(&dev_replace->rwsem); dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; dev_replace->srcdev = NULL; dev_replace->tgtdev = NULL; + up_write(&dev_replace->rwsem); goto leave; } @@ -501,17 +500,12 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); - if (ret == -EINPROGRESS) { + if (ret == -EINPROGRESS) ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; - } else if (ret != -ECANCELED) { - WARN_ON(ret); - } return ret; leave: - if (need_unlock) - up_write(&dev_replace->rwsem); btrfs_destroy_dev_replace_tgtdev(tgt_device); return ret; } @@ -603,17 +597,33 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, } btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return PTR_ERR(trans); + /* + * We have to use this loop approach because at this point src_device + * has to be available for transaction commit to complete, yet new + * chunks shouldn't be allocated on the device. + */ + while (1) { + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans); + WARN_ON(ret); + + /* Prevent write_all_supers() during the finishing procedure */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); + /* Prevent new chunks being allocated on the source device */ + mutex_lock(&fs_info->chunk_mutex); + + if (!list_empty(&src_device->post_commit_list)) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_info->chunk_mutex); + } else { + break; + } } - ret = btrfs_commit_transaction(trans); - WARN_ON(ret); - /* keep away write_all_supers() during the finishing procedure */ - mutex_lock(&fs_info->fs_devices->device_list_mutex); - mutex_lock(&fs_info->chunk_mutex); down_write(&dev_replace->rwsem); dev_replace->replace_state = scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED @@ -662,8 +672,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_device_set_disk_total_bytes(tgt_device, src_device->disk_total_bytes); btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used); - ASSERT(list_empty(&src_device->resized_list)); - tgt_device->commit_total_bytes = src_device->commit_total_bytes; tgt_device->commit_bytes_used = src_device->bytes_used; btrfs_assign_next_active_device(src_device, tgt_device); @@ -696,7 +704,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* replace the sysfs entry */ btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device); - btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); + btrfs_rm_dev_replace_free_srcdev(src_device); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); @@ -713,7 +721,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_device *srcdev, struct btrfs_device *tgtdev) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; u64 start = 0; @@ -975,7 +983,7 @@ static int btrfs_dev_replace_kthread(void *data) return 0; } -int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) +int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) { if (!dev_replace->is_valid) return 0; diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 4aa40bacc6cc..60b70dacc299 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -9,8 +9,7 @@ struct btrfs_ioctl_dev_replace_args; int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); -int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans); int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, @@ -18,6 +17,6 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); -int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); #endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 8de74d835dba..863367c2c620 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -36,7 +36,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle di = btrfs_match_dir_item_name(fs_info, path, name, name_len); if (di) return ERR_PTR(-EEXIST); - btrfs_extend_item(fs_info, path, data_size); + btrfs_extend_item(path, data_size); } else if (ret < 0) return ERR_PTR(ret); WARN_ON(ret > 0); @@ -429,8 +429,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_len - (ptr + sub_item_len - start)); - btrfs_truncate_item(root->fs_info, path, - item_len - sub_item_len, 1); + btrfs_truncate_item(path, item_len - sub_item_len, 1); } return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6fe9197f6ee4..e0edfdc9c82b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -19,6 +19,7 @@ #include <linux/crc32c.h> #include <linux/sched/mm.h> #include <asm/unaligned.h> +#include <crypto/hash.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -39,10 +40,7 @@ #include "compression.h" #include "tree-checker.h" #include "ref-verify.h" - -#ifdef CONFIG_X86 -#include <asm/cpufeature.h> -#endif +#include "block-group.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -207,7 +205,6 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; int ret; @@ -215,7 +212,6 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode, read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (em) { - em->bdev = fs_info->fs_devices->latest_bdev; read_unlock(&em_tree->lock); goto out; } @@ -230,7 +226,6 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode, em->len = (u64)-1; em->block_len = (u64)-1; em->block_start = 0; - em->bdev = fs_info->fs_devices->latest_bdev; write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); @@ -249,26 +244,15 @@ out: return em; } -u32 btrfs_csum_data(const char *data, u32 seed, size_t len) -{ - return crc32c(seed, data, len); -} - -void btrfs_csum_final(u32 crc, u8 *result) -{ - put_unaligned_le32(~crc, result); -} - /* - * compute the csum for a btree block, and either verify it or write it - * into the csum field of the block. + * Compute the csum of a btree block and store the result to provided buffer. + * + * Returns error if the extent buffer cannot be mapped. */ -static int csum_tree_block(struct btrfs_fs_info *fs_info, - struct extent_buffer *buf, - int verify) +static int csum_tree_block(struct extent_buffer *buf, u8 *result) { - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); - char result[BTRFS_CSUM_SIZE]; + struct btrfs_fs_info *fs_info = buf->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); unsigned long len; unsigned long cur_len; unsigned long offset = BTRFS_CSUM_SIZE; @@ -276,9 +260,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, unsigned long map_start; unsigned long map_len; int err; - u32 crc = ~(u32)0; + + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); len = buf->len - offset; + while (len > 0) { /* * Note: we don't need to check for the err == 1 case here, as @@ -288,34 +275,16 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, */ err = map_private_extent_buffer(buf, offset, 32, &kaddr, &map_start, &map_len); - if (err) + if (WARN_ON(err)) return err; cur_len = min(len, map_len - (offset - map_start)); - crc = btrfs_csum_data(kaddr + offset - map_start, - crc, cur_len); + crypto_shash_update(shash, kaddr + offset - map_start, cur_len); len -= cur_len; offset += cur_len; } memset(result, 0, BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, result); - - if (verify) { - if (memcmp_extent_buffer(buf, result, 0, csum_size)) { - u32 val; - u32 found = 0; - memcpy(&found, result, csum_size); - - read_extent_buffer(buf, &val, 0, csum_size); - btrfs_warn_rl(fs_info, - "%s checksum verify failed on %llu wanted %X found %X level %d", - fs_info->sb->s_id, buf->start, - val, found, btrfs_header_level(buf)); - return -EUCLEAN; - } - } else { - write_extent_buffer(buf, result, 0, csum_size); - } + crypto_shash_final(shash, result); return 0; } @@ -376,6 +345,19 @@ out: return ret; } +static bool btrfs_supported_super_csum(u16 csum_type) +{ + switch (csum_type) { + case BTRFS_CSUM_TYPE_CRC32: + case BTRFS_CSUM_TYPE_XXHASH: + case BTRFS_CSUM_TYPE_SHA256: + case BTRFS_CSUM_TYPE_BLAKE2: + return true; + default: + return false; + } +} + /* * Return 0 if the superblock checksum type matches the checksum value of that * algorithm. Pass the raw disk superblock data. @@ -385,51 +367,42 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, { struct btrfs_super_block *disk_sb = (struct btrfs_super_block *)raw_disk_sb; - u16 csum_type = btrfs_super_csum_type(disk_sb); - int ret = 0; - - if (csum_type == BTRFS_CSUM_TYPE_CRC32) { - u32 crc = ~(u32)0; - char result[sizeof(crc)]; + char result[BTRFS_CSUM_SIZE]; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - /* - * The super_block structure does not span the whole - * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space - * is filled with zeros and is included in the checksum. - */ - crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, - crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, result); + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); - if (memcmp(raw_disk_sb, result, sizeof(result))) - ret = 1; - } + /* + * The super_block structure does not span the whole + * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is + * filled with zeros and is included in the checksum. + */ + crypto_shash_update(shash, raw_disk_sb + BTRFS_CSUM_SIZE, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + crypto_shash_final(shash, result); - if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { - btrfs_err(fs_info, "unsupported checksum algorithm %u", - csum_type); - ret = 1; - } + if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb))) + return 1; - return ret; + return 0; } -static int verify_level_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid) +int btrfs_verify_level_key(struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid) { + struct btrfs_fs_info *fs_info = eb->fs_info; int found_level; struct btrfs_key found_key; int ret; found_level = btrfs_header_level(eb); if (found_level != level) { -#ifdef CONFIG_BTRFS_DEBUG - WARN_ON(1); + WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), + KERN_ERR "BTRFS: tree level check failed\n"); btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", eb->start, level, found_level); -#endif return -EIO; } @@ -444,15 +417,25 @@ static int verify_level_key(struct btrfs_fs_info *fs_info, */ if (btrfs_header_generation(eb) > fs_info->last_trans_committed) return 0; + + /* We have @first_key, so this @eb must have at least one item */ + if (btrfs_header_nritems(eb) == 0) { + btrfs_err(fs_info, + "invalid tree nritems, bytenr=%llu nritems=0 expect >0", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return -EUCLEAN; + } + if (found_level) btrfs_node_key_to_cpu(eb, &found_key, 0); else btrfs_item_key_to_cpu(eb, &found_key, 0); ret = btrfs_comp_cpu_keys(first_key, &found_key); -#ifdef CONFIG_BTRFS_DEBUG if (ret) { - WARN_ON(1); + WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), + KERN_ERR "BTRFS: tree first key check failed\n"); btrfs_err(fs_info, "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", eb->start, parent_transid, first_key->objectid, @@ -460,7 +443,6 @@ static int verify_level_key(struct btrfs_fs_info *fs_info, found_key.objectid, found_key.type, found_key.offset); } -#endif return ret; } @@ -472,11 +454,11 @@ static int verify_level_key(struct btrfs_fs_info *fs_info, * @level: expected level, mandatory check * @first_key: expected key of first slot, skip check if NULL */ -static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, +static int btree_read_extent_buffer_pages(struct extent_buffer *eb, u64 parent_transid, int level, struct btrfs_key *first_key) { + struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; int failed = 0; int ret; @@ -487,14 +469,13 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); - ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, - mirror_num); + ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num); if (!ret) { if (verify_parent_transid(io_tree, eb, parent_transid, 0)) ret = -EIO; - else if (verify_level_key(fs_info, eb, level, - first_key, parent_transid)) + else if (btrfs_verify_level_key(eb, level, + first_key, parent_transid)) ret = -EUCLEAN; else break; @@ -519,7 +500,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, } if (failed && !ret && failed_mirror) - repair_eb_io_failure(fs_info, eb, failed_mirror); + btrfs_repair_eb_io_failure(eb, failed_mirror); return ret; } @@ -533,7 +514,10 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) { u64 start = page_offset(page); u64 found_start; + u8 result[BTRFS_CSUM_SIZE]; + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); struct extent_buffer *eb; + int ret; eb = (struct extent_buffer *)page->private; if (page != eb->pages[0]) @@ -552,12 +536,30 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0); - return csum_tree_block(fs_info, eb, 0); + if (csum_tree_block(eb, result)) + return -EINVAL; + + if (btrfs_header_level(eb)) + ret = btrfs_check_node(eb); + else + ret = btrfs_check_leaf_full(eb); + + if (ret < 0) { + btrfs_print_tree(eb, 0); + btrfs_err(fs_info, + "block=%llu write time tree block corruption detected", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return ret; + } + write_extent_buffer(eb, result, 0, csum_size); + + return 0; } -static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) +static int check_tree_block_fsid(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u8 fsid[BTRFS_FSID_SIZE]; int ret = 1; @@ -595,7 +597,9 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; struct btrfs_fs_info *fs_info = root->fs_info; + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int ret = 0; + u8 result[BTRFS_CSUM_SIZE]; int reads_done; if (!page->private) @@ -606,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, /* the pending IO might have been the only thing that kept this buffer * in memory. Make sure we have a ref for all this other checks */ - extent_buffer_get(eb); + atomic_inc(&eb->refs); reads_done = atomic_dec_and_test(&eb->io_pages); if (!reads_done) @@ -625,7 +629,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, ret = -EIO; goto err; } - if (check_tree_block_fsid(fs_info, eb)) { + if (check_tree_block_fsid(eb)) { btrfs_err_rl(fs_info, "bad fsid on block %llu", eb->start); ret = -EIO; @@ -642,25 +646,44 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, found_level); - ret = csum_tree_block(fs_info, eb, 1); + ret = csum_tree_block(eb, result); if (ret) goto err; + if (memcmp_extent_buffer(eb, result, 0, csum_size)) { + u32 val; + u32 found = 0; + + memcpy(&found, result, csum_size); + + read_extent_buffer(eb, &val, 0, csum_size); + btrfs_warn_rl(fs_info, + "%s checksum verify failed on %llu wanted %x found %x level %d", + fs_info->sb->s_id, eb->start, + val, found, btrfs_header_level(eb)); + ret = -EUCLEAN; + goto err; + } + /* * If this is a leaf block and it is corrupt, set the corrupt bit so * that we don't try and read the other copies of this block, just * return -EIO. */ - if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) { + if (found_level == 0 && btrfs_check_leaf_full(eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } - if (found_level > 0 && btrfs_check_node(fs_info, eb)) + if (found_level > 0 && btrfs_check_node(eb)) ret = -EIO; if (!ret) set_extent_buffer_uptodate(eb); + else + btrfs_err(fs_info, + "block=%llu read time tree block corruption detected", + eb->start); err: if (reads_done && test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) @@ -685,43 +708,31 @@ static void end_workqueue_bio(struct bio *bio) struct btrfs_end_io_wq *end_io_wq = bio->bi_private; struct btrfs_fs_info *fs_info; struct btrfs_workqueue *wq; - btrfs_work_func_t func; fs_info = end_io_wq->info; end_io_wq->status = bio->bi_status; if (bio_op(bio) == REQ_OP_WRITE) { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) wq = fs_info->endio_meta_write_workers; - func = btrfs_endio_meta_write_helper; - } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) { + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) wq = fs_info->endio_freespace_worker; - func = btrfs_freespace_write_helper; - } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) wq = fs_info->endio_raid56_workers; - func = btrfs_endio_raid56_helper; - } else { + else wq = fs_info->endio_write_workers; - func = btrfs_endio_write_helper; - } } else { - if (unlikely(end_io_wq->metadata == - BTRFS_WQ_ENDIO_DIO_REPAIR)) { + if (unlikely(end_io_wq->metadata == BTRFS_WQ_ENDIO_DIO_REPAIR)) wq = fs_info->endio_repair_workers; - func = btrfs_endio_repair_helper; - } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) wq = fs_info->endio_raid56_workers; - func = btrfs_endio_raid56_helper; - } else if (end_io_wq->metadata) { + else if (end_io_wq->metadata) wq = fs_info->endio_meta_workers; - func = btrfs_endio_meta_helper; - } else { + else wq = fs_info->endio_workers; - func = btrfs_endio_helper; - } } - btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL); + btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); btrfs_queue_work(wq, &end_io_wq->work); } @@ -782,8 +793,13 @@ static void run_one_async_done(struct btrfs_work *work) return; } - ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, - async->mirror_num, 1); + /* + * All of the bios that pass through here are from async helpers. + * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. + * This changes nothing when cgroups aren't in use. + */ + async->bio->bi_opf |= REQ_CGROUP_PUNT; + ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num); if (ret) { async->bio->bi_status = ret; bio_endio(async->bio); @@ -814,8 +830,8 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, async->mirror_num = mirror_num; async->submit_bio_start = submit_bio_start; - btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start, - run_one_async_done, run_one_async_free); + btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, + run_one_async_free); async->bio_offset = bio_offset; @@ -832,11 +848,11 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) { struct bio_vec *bvec; struct btrfs_root *root; - int i, ret = 0; + int ret = 0; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i, iter_all) { + bio_for_each_segment_all(bvec, bio, iter_all) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; ret = csum_dirty_buffer(root->fs_info, bvec->bv_page); if (ret) @@ -856,24 +872,22 @@ static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio, return btree_csum_one_bio(bio); } -static int check_async_write(struct btrfs_inode *bi) +static int check_async_write(struct btrfs_fs_info *fs_info, + struct btrfs_inode *bi) { if (atomic_read(&bi->sync_writers)) return 0; -#ifdef CONFIG_X86 - if (static_cpu_has(X86_FEATURE_XMM4_2)) + if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) return 0; -#endif return 1; } -static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio, + int mirror_num, + unsigned long bio_flags) { - struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int async = check_async_write(BTRFS_I(inode)); + int async = check_async_write(fs_info, BTRFS_I(inode)); blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { @@ -885,20 +899,19 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, BTRFS_WQ_ENDIO_METADATA); if (ret) goto out_w_error; - ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); + ret = btrfs_map_bio(fs_info, bio, mirror_num); } else if (!async) { ret = btree_csum_one_bio(bio); if (ret) goto out_w_error; - ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); + ret = btrfs_map_bio(fs_info, bio, mirror_num); } else { /* * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0, - bio_offset, private_data, - btree_submit_bio_start); + 0, inode, btree_submit_bio_start); } if (ret) @@ -1017,46 +1030,17 @@ static const struct address_space_operations btree_aops = { void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) { struct extent_buffer *buf = NULL; - struct inode *btree_inode = fs_info->btree_inode; - - buf = btrfs_find_create_tree_block(fs_info, bytenr); - if (IS_ERR(buf)) - return; - read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, WAIT_NONE, 0); - free_extent_buffer(buf); -} - -int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, - int mirror_num, struct extent_buffer **eb) -{ - struct extent_buffer *buf = NULL; - struct inode *btree_inode = fs_info->btree_inode; - struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; int ret; buf = btrfs_find_create_tree_block(fs_info, bytenr); if (IS_ERR(buf)) - return 0; - - set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); - - ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK, - mirror_num); - if (ret) { - free_extent_buffer(buf); - return ret; - } + return; - if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { - free_extent_buffer(buf); - return -EIO; - } else if (extent_buffer_uptodate(buf)) { - *eb = buf; - } else { + ret = read_extent_buffer_pages(buf, WAIT_NONE, 0); + if (ret < 0) + free_extent_buffer_stale(buf); + else free_extent_buffer(buf); - } - return 0; } struct extent_buffer *btrfs_find_create_tree_block( @@ -1068,19 +1052,6 @@ struct extent_buffer *btrfs_find_create_tree_block( return alloc_extent_buffer(fs_info, bytenr); } - -int btrfs_write_tree_block(struct extent_buffer *buf) -{ - return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, - buf->start + buf->len - 1); -} - -void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) -{ - filemap_fdatawait_range(buf->pages[0]->mapping, - buf->start, buf->start + buf->len - 1); -} - /* * Read tree block at logical address @bytenr and do variant basic but critical * verification. @@ -1100,19 +1071,19 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, if (IS_ERR(buf)) return buf; - ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid, + ret = btree_read_extent_buffer_pages(buf, parent_transid, level, first_key); if (ret) { - free_extent_buffer(buf); + free_extent_buffer_stale(buf); return ERR_PTR(ret); } return buf; } -void clean_tree_block(struct btrfs_fs_info *fs_info, - struct extent_buffer *buf) +void btrfs_clean_tree_block(struct extent_buffer *buf) { + struct btrfs_fs_info *fs_info = buf->fs_info; if (btrfs_header_generation(buf) == fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); @@ -1208,7 +1179,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->log_transid_committed = -1; root->last_log_commit = 0; if (!dummy) - extent_io_tree_init(&root->dirty_log_pages, NULL); + extent_io_tree_init(fs_info, &root->dirty_log_pages, + IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); @@ -1255,9 +1227,9 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info) #endif struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 objectid) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct extent_buffer *leaf; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root; @@ -1680,8 +1652,8 @@ static void end_workqueue_fn(struct btrfs_work *work) bio->bi_status = end_io_wq->status; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; - kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); bio_endio(bio); + kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); } static int cleaner_kthread(void *arg) @@ -1776,7 +1748,7 @@ static int transaction_kthread(void *arg) } now = ktime_get_seconds(); - if (cur->state < TRANS_STATE_BLOCKED && + if (cur->state < TRANS_STATE_COMMIT_START && !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) && (now < cur->start_time || now - cur->start_time < fs_info->commit_interval)) { @@ -1815,18 +1787,18 @@ sleep: } /* - * this will find the highest generation in the array of - * root backups. The index of the highest array is returned, - * or -1 if we can't find anything. + * This will find the highest generation in the array of root backups. The + * index of the highest array is returned, or -EINVAL if we can't find + * anything. * * We check to make sure the array is valid by comparing the * generation of the latest root in the array with the generation * in the super block. If they don't match we pitch it. */ -static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen) +static int find_newest_super_backup(struct btrfs_fs_info *info) { + const u64 newest_gen = btrfs_super_generation(info->super_copy); u64 cur; - int newest_index = -1; struct btrfs_root_backup *root_backup; int i; @@ -1834,37 +1806,10 @@ static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen) root_backup = info->super_copy->super_roots + i; cur = btrfs_backup_tree_root_gen(root_backup); if (cur == newest_gen) - newest_index = i; - } - - /* check to see if we actually wrapped around */ - if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) { - root_backup = info->super_copy->super_roots; - cur = btrfs_backup_tree_root_gen(root_backup); - if (cur == newest_gen) - newest_index = 0; + return i; } - return newest_index; -} - - -/* - * find the oldest backup so we know where to store new entries - * in the backup array. This will set the backup_root_index - * field in the fs_info struct - */ -static void find_oldest_super_backup(struct btrfs_fs_info *info, - u64 newest_gen) -{ - int newest_index = -1; - newest_index = find_newest_super_backup(info, newest_gen); - /* if there was garbage in there, just move along */ - if (newest_index == -1) { - info->backup_root_index = 0; - } else { - info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS; - } + return -EINVAL; } /* @@ -1874,22 +1819,8 @@ static void find_oldest_super_backup(struct btrfs_fs_info *info, */ static void backup_super_roots(struct btrfs_fs_info *info) { - int next_backup; + const int next_backup = info->backup_root_index; struct btrfs_root_backup *root_backup; - int last_backup; - - next_backup = info->backup_root_index; - last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) % - BTRFS_NUM_BACKUP_ROOTS; - - /* - * just overwrite the last backup if we're at the same generation - * this happens only at umount - */ - root_backup = info->super_for_commit->super_roots + last_backup; - if (btrfs_backup_tree_root_gen(root_backup) == - btrfs_header_generation(info->tree_root->node)) - next_backup = last_backup; root_backup = info->super_for_commit->super_roots + next_backup; @@ -1962,40 +1893,31 @@ static void backup_super_roots(struct btrfs_fs_info *info) } /* - * this copies info out of the root backup array and back into - * the in-memory super block. It is meant to help iterate through - * the array, so you send it the number of backups you've already - * tried and the last backup index you used. + * read_backup_root - Reads a backup root based on the passed priority. Prio 0 + * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots + * + * fs_info - filesystem whose backup roots need to be read + * priority - priority of backup root required * - * this returns -1 when it has tried all the backups + * Returns backup root index on success and -EINVAL otherwise. */ -static noinline int next_root_backup(struct btrfs_fs_info *info, - struct btrfs_super_block *super, - int *num_backups_tried, int *backup_index) +static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority) { + int backup_index = find_newest_super_backup(fs_info); + struct btrfs_super_block *super = fs_info->super_copy; struct btrfs_root_backup *root_backup; - int newest = *backup_index; - if (*num_backups_tried == 0) { - u64 gen = btrfs_super_generation(super); + if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) { + if (priority == 0) + return backup_index; - newest = find_newest_super_backup(info, gen); - if (newest == -1) - return -1; - - *backup_index = newest; - *num_backups_tried = 1; - } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) { - /* we've tried all the backups, all done */ - return -1; + backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority; + backup_index %= BTRFS_NUM_BACKUP_ROOTS; } else { - /* jump to the next oldest backup */ - newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) % - BTRFS_NUM_BACKUP_ROOTS; - *backup_index = newest; - *num_backups_tried += 1; + return -EINVAL; } - root_backup = super->super_roots + newest; + + root_backup = super->super_roots + backup_index; btrfs_set_super_generation(super, btrfs_backup_tree_root_gen(root_backup)); @@ -2005,12 +1927,13 @@ static noinline int next_root_backup(struct btrfs_fs_info *info, btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); /* - * fixme: the total bytes and num_devices need to match or we should + * Fixme: the total bytes and num_devices need to match or we should * need a fsck */ btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); - return 0; + + return backup_index; } /* helper to cleanup workers */ @@ -2025,13 +1948,11 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->rmw_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); - btrfs_destroy_workqueue(fs_info->submit_workers); btrfs_destroy_workqueue(fs_info->delayed_workers); btrfs_destroy_workqueue(fs_info->caching_workers); btrfs_destroy_workqueue(fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); - btrfs_destroy_workqueue(fs_info->extent_workers); /* * Now that all other work queues are destroyed, we can safely destroy * the queues used for metadata I/O, since tasks from those other work @@ -2052,7 +1973,7 @@ static void free_root_extent_buffers(struct btrfs_root *root) } /* helper to cleanup tree roots */ -static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) +static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) { free_root_extent_buffers(info->tree_root); @@ -2061,7 +1982,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) free_root_extent_buffers(info->csum_root); free_root_extent_buffers(info->quota_root); free_root_extent_buffers(info->uuid_root); - if (chunk_root) + if (free_chunk_root) free_root_extent_buffers(info->chunk_root); free_root_extent_buffers(info->free_space_root); } @@ -2138,8 +2059,9 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) inode->i_mapping->a_ops = &btree_aops; RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); - extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode); - BTRFS_I(inode)->io_tree.track_uptodate = 0; + extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, + IO_TREE_INODE_IO, inode); + BTRFS_I(inode)->io_tree.track_uptodate = false; extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops; @@ -2162,7 +2084,6 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->qgroup_lock); mutex_init(&fs_info->qgroup_ioctl_lock); fs_info->qgroup_tree = RB_ROOT; - fs_info->qgroup_op_tree = RB_ROOT; INIT_LIST_HEAD(&fs_info->dirty_qgroups); fs_info->qgroup_seq = 1; fs_info->qgroup_ulist = NULL; @@ -2191,16 +2112,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, fs_info->caching_workers = btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0); - /* - * a higher idle thresh on the submit workers makes it much more - * likely that bios will be send down in a sane order to the - * devices - */ - fs_info->submit_workers = - btrfs_alloc_workqueue(fs_info, "submit", flags, - min_t(u64, fs_devices->num_devices, - max_active), 64); - fs_info->fixup_workers = btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); @@ -2237,13 +2148,9 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, max_active, 2); fs_info->qgroup_rescan_workers = btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); - fs_info->extent_workers = - btrfs_alloc_workqueue(fs_info, "extent-refs", flags, - min_t(u64, fs_devices->num_devices, - max_active), 8); if (!(fs_info->workers && fs_info->delalloc_workers && - fs_info->submit_workers && fs_info->flush_workers && + fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->endio_meta_write_workers && fs_info->endio_repair_workers && @@ -2251,7 +2158,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && fs_info->fixup_workers && fs_info->delayed_workers && - fs_info->extent_workers && fs_info->qgroup_rescan_workers)) { return -ENOMEM; } @@ -2259,6 +2165,29 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, return 0; } +static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) +{ + struct crypto_shash *csum_shash; + const char *csum_driver = btrfs_super_csum_driver(csum_type); + + csum_shash = crypto_alloc_shash(csum_driver, 0, 0); + + if (IS_ERR(csum_shash)) { + btrfs_err(fs_info, "error allocating %s hash for checksum", + csum_driver); + return PTR_ERR(csum_shash); + } + + fs_info->csum_shash = csum_shash; + + return 0; +} + +static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) +{ + crypto_free_shash(fs_info->csum_shash); +} + static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { @@ -2574,7 +2503,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, ret = validate_super(fs_info, sb, -1); if (ret < 0) goto out; - if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) { + if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid csum type, has %u want %u", btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); @@ -2595,7 +2524,101 @@ out: return ret; } -int open_ctree(struct super_block *sb, +static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) +{ + int backup_index = find_newest_super_backup(fs_info); + struct btrfs_super_block *sb = fs_info->super_copy; + struct btrfs_root *tree_root = fs_info->tree_root; + bool handle_error = false; + int ret = 0; + int i; + + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { + u64 generation; + int level; + + if (handle_error) { + if (!IS_ERR(tree_root->node)) + free_extent_buffer(tree_root->node); + tree_root->node = NULL; + + if (!btrfs_test_opt(fs_info, USEBACKUPROOT)) + break; + + free_root_pointers(fs_info, 0); + + /* + * Don't use the log in recovery mode, it won't be + * valid + */ + btrfs_set_super_log_root(sb, 0); + + /* We can't trust the free space cache either */ + btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); + + ret = read_backup_root(fs_info, i); + backup_index = ret; + if (ret < 0) + return ret; + } + generation = btrfs_super_generation(sb); + level = btrfs_super_root_level(sb); + tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb), + generation, level, NULL); + if (IS_ERR(tree_root->node) || + !extent_buffer_uptodate(tree_root->node)) { + handle_error = true; + + if (IS_ERR(tree_root->node)) + ret = PTR_ERR(tree_root->node); + else if (!extent_buffer_uptodate(tree_root->node)) + ret = -EUCLEAN; + + btrfs_warn(fs_info, "failed to read tree root"); + continue; + } + + btrfs_set_root_node(&tree_root->root_item, tree_root->node); + tree_root->commit_root = btrfs_root_node(tree_root); + btrfs_set_root_refs(&tree_root->root_item, 1); + + /* + * No need to hold btrfs_root::objectid_mutex since the fs + * hasn't been fully initialised and we are the only user + */ + ret = btrfs_find_highest_objectid(tree_root, + &tree_root->highest_objectid); + if (ret < 0) { + handle_error = true; + continue; + } + + ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + + ret = btrfs_read_roots(fs_info); + if (ret < 0) { + handle_error = true; + continue; + } + + /* All successful */ + fs_info->generation = generation; + fs_info->last_trans_committed = generation; + + /* Always begin writing backup roots after the one being used */ + if (backup_index < 0) { + fs_info->backup_root_index = 0; + } else { + fs_info->backup_root_index = backup_index + 1; + fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS; + } + break; + } + + return ret; +} + +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) { @@ -2604,6 +2627,7 @@ int open_ctree(struct super_block *sb, u32 stripesize; u64 generation; u64 features; + u16 csum_type; struct btrfs_key location; struct buffer_head *bh; struct btrfs_super_block *disk_super; @@ -2612,8 +2636,6 @@ int open_ctree(struct super_block *sb, struct btrfs_root *chunk_root; int ret; int err = -EINVAL; - int num_backups_tried = 0; - int backup_index = 0; int clear_free_space_tree = 0; int level; @@ -2630,11 +2652,17 @@ int open_ctree(struct super_block *sb, goto fail; } - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); + ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_srcu; } + + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); + if (ret) { + err = ret; + goto fail_dio_bytes; + } fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); @@ -2658,8 +2686,6 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->delayed_iputs); INIT_LIST_HEAD(&fs_info->delalloc_roots); INIT_LIST_HEAD(&fs_info->caching_block_groups); - INIT_LIST_HEAD(&fs_info->pending_raid_kobjs); - spin_lock_init(&fs_info->pending_raid_kobjs_lock); spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); @@ -2667,7 +2693,6 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); - spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); rwlock_init(&fs_info->tree_mod_log_lock); @@ -2681,7 +2706,7 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); - btrfs_mapping_init(&fs_info->mapping_tree); + extent_map_tree_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); @@ -2694,7 +2719,6 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->defrag_running, 0); - atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); atomic_set(&fs_info->nr_delayed_iputs, 0); atomic64_set(&fs_info->tree_mod_seq, 0); @@ -2748,8 +2772,10 @@ int open_ctree(struct super_block *sb, fs_info->block_group_cache_tree = RB_ROOT; fs_info->first_logical_byte = (u64)-1; - extent_io_tree_init(&fs_info->freed_extents[0], NULL); - extent_io_tree_init(&fs_info->freed_extents[1], NULL); + extent_io_tree_init(fs_info, &fs_info->freed_extents[0], + IO_TREE_FS_INFO_FREED_EXTENTS0, NULL); + extent_io_tree_init(fs_info, &fs_info->freed_extents[1], + IO_TREE_FS_INFO_FREED_EXTENTS1, NULL); fs_info->pinned_extents = &fs_info->freed_extents[0]; set_bit(BTRFS_FS_BARRIER, &fs_info->flags); @@ -2776,8 +2802,6 @@ int open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->async_submit_wait); init_waitqueue_head(&fs_info->delayed_iputs_wait); - INIT_LIST_HEAD(&fs_info->pinned_chunks); - /* Usable values until the real ones are cached from the superblock */ fs_info->nodesize = 4096; fs_info->sectorsize = 4096; @@ -2786,6 +2810,8 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; + fs_info->send_in_progress = 0; + ret = btrfs_alloc_stripe_hash_table(fs_info); if (ret) { err = ret; @@ -2806,6 +2832,25 @@ int open_ctree(struct super_block *sb, } /* + * Verify the type first, if that or the the checksum value are + * corrupted, we'll find out + */ + csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data); + if (!btrfs_supported_super_csum(csum_type)) { + btrfs_err(fs_info, "unsupported checksum algorithm: %u", + csum_type); + err = -EINVAL; + brelse(bh); + goto fail_alloc; + } + + ret = btrfs_init_csum_hash(fs_info, csum_type); + if (ret) { + err = ret; + goto fail_alloc; + } + + /* * We want to check superblock checksum, the type is stored inside. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ @@ -2813,7 +2858,7 @@ int open_ctree(struct super_block *sb, btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; brelse(bh); - goto fail_alloc; + goto fail_csum; } /* @@ -2850,24 +2895,17 @@ int open_ctree(struct super_block *sb, if (ret) { btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; - goto fail_alloc; + goto fail_csum; } if (!btrfs_super_root(disk_super)) - goto fail_alloc; + goto fail_csum; /* check FS state, whether FS is broken. */ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); /* - * run through our array of backup supers and setup - * our ring pointer to the oldest one - */ - generation = btrfs_super_generation(disk_super); - find_oldest_super_backup(fs_info, generation); - - /* * In the long term, we'll store the compression type in the super * block, and it'll be used for per file compression control. */ @@ -2876,7 +2914,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_parse_options(fs_info, options, sb->s_flags); if (ret) { err = ret; - goto fail_alloc; + goto fail_csum; } features = btrfs_super_incompat_flags(disk_super) & @@ -2886,7 +2924,7 @@ int open_ctree(struct super_block *sb, "cannot mount because of unsupported optional features (%llx)", features); err = -EINVAL; - goto fail_alloc; + goto fail_csum; } features = btrfs_super_incompat_flags(disk_super); @@ -2930,7 +2968,7 @@ int open_ctree(struct super_block *sb, btrfs_err(fs_info, "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", nodesize, sectorsize); - goto fail_alloc; + goto fail_csum; } /* @@ -2946,7 +2984,7 @@ int open_ctree(struct super_block *sb, "cannot mount read-write because of unsupported optional features (%llx)", features); err = -EINVAL; - goto fail_alloc; + goto fail_csum; } ret = btrfs_init_workqueues(fs_info, fs_devices); @@ -3013,44 +3051,9 @@ int open_ctree(struct super_block *sb, goto fail_tree_roots; } -retry_root_backup: - generation = btrfs_super_generation(disk_super); - level = btrfs_super_root_level(disk_super); - - tree_root->node = read_tree_block(fs_info, - btrfs_super_root(disk_super), - generation, level, NULL); - if (IS_ERR(tree_root->node) || - !extent_buffer_uptodate(tree_root->node)) { - btrfs_warn(fs_info, "failed to read tree root"); - if (!IS_ERR(tree_root->node)) - free_extent_buffer(tree_root->node); - tree_root->node = NULL; - goto recovery_tree_root; - } - - btrfs_set_root_node(&tree_root->root_item, tree_root->node); - tree_root->commit_root = btrfs_root_node(tree_root); - btrfs_set_root_refs(&tree_root->root_item, 1); - - mutex_lock(&tree_root->objectid_mutex); - ret = btrfs_find_highest_objectid(tree_root, - &tree_root->highest_objectid); - if (ret) { - mutex_unlock(&tree_root->objectid_mutex); - goto recovery_tree_root; - } - - ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); - - mutex_unlock(&tree_root->objectid_mutex); - - ret = btrfs_read_roots(fs_info); + ret = init_tree_roots(fs_info); if (ret) - goto recovery_tree_root; - - fs_info->generation = generation; - fs_info->last_trans_committed = generation; + goto fail_tree_roots; ret = btrfs_verify_dev_extents(fs_info); if (ret) { @@ -3318,12 +3321,14 @@ fail_block_groups: btrfs_put_block_group_cache(fs_info); fail_tree_roots: - free_root_pointers(fs_info, 1); + free_root_pointers(fs_info, true); invalidate_inode_pages2(fs_info->btree_inode->i_mapping); fail_sb_buffer: btrfs_stop_all_workers(fs_info); btrfs_free_block_groups(fs_info); +fail_csum: + btrfs_free_csum_hash(fs_info); fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -3335,30 +3340,14 @@ fail_delalloc_bytes: percpu_counter_destroy(&fs_info->delalloc_bytes); fail_dirty_metadata_bytes: percpu_counter_destroy(&fs_info->dirty_metadata_bytes); +fail_dio_bytes: + percpu_counter_destroy(&fs_info->dio_bytes); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: btrfs_free_stripe_hash_table(fs_info); btrfs_close_devices(fs_info->fs_devices); return err; - -recovery_tree_root: - if (!btrfs_test_opt(fs_info, USEBACKUPROOT)) - goto fail_tree_roots; - - free_root_pointers(fs_info, 0); - - /* don't use the log in recovery mode, it won't be valid */ - btrfs_set_super_log_root(disk_super, 0); - - /* we can't trust the free space cache either */ - btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); - - ret = next_root_backup(fs_info, fs_info->super_copy, - &num_backups_tried, &backup_index); - if (ret == -1) - goto fail_block_groups; - goto retry_root_backup; } ALLOW_ERROR_INJECTION(open_ctree, ERRNO); @@ -3463,17 +3452,20 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int max_mirrors) { + struct btrfs_fs_info *fs_info = device->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct buffer_head *bh; int i; int ret; int errors = 0; - u32 crc; u64 bytenr; int op_flags; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; + shash->tfm = fs_info->csum_shash; + for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= @@ -3482,10 +3474,10 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_set_super_bytenr(sb, bytenr); - crc = ~(u32)0; - crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc, - BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, sb->csum); + crypto_shash_init(shash); + crypto_shash_update(shash, (const char *)sb + BTRFS_CSUM_SIZE, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + crypto_shash_final(shash, sb->csum); /* One reference for us, and we leave it for the caller */ bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, @@ -3700,7 +3692,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) - min_tolerated = min(min_tolerated, + min_tolerated = min_t(int, min_tolerated, btrfs_raid_array[BTRFS_RAID_SINGLE]. tolerated_failures); @@ -3709,7 +3701,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) continue; if (!(flags & btrfs_raid_array[raid_type].bg_flag)) continue; - min_tolerated = min(min_tolerated, + min_tolerated = min_t(int, min_tolerated, btrfs_raid_array[raid_type]. tolerated_failures); } @@ -3949,7 +3941,7 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info) return btrfs_commit_transaction(trans); } -void close_ctree(struct btrfs_fs_info *fs_info) +void __cold close_ctree(struct btrfs_fs_info *fs_info) { int ret; @@ -4016,6 +4008,10 @@ void close_ctree(struct btrfs_fs_info *fs_info) percpu_counter_sum(&fs_info->delalloc_bytes)); } + if (percpu_counter_sum(&fs_info->dio_bytes)) + btrfs_info(fs_info, "at unmount dio bytes count %lld", + percpu_counter_sum(&fs_info->dio_bytes)); + btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); @@ -4033,7 +4029,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) btrfs_free_block_groups(fs_info); clear_bit(BTRFS_FS_OPEN, &fs_info->flags); - free_root_pointers(fs_info, 1); + free_root_pointers(fs_info, true); iput(fs_info->btree_inode); @@ -4042,25 +4038,18 @@ void close_ctree(struct btrfs_fs_info *fs_info) btrfsic_unmount(fs_info->fs_devices); #endif - btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); + btrfs_close_devices(fs_info->fs_devices); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); + percpu_counter_destroy(&fs_info->dio_bytes); percpu_counter_destroy(&fs_info->dev_replace.bio_counter); cleanup_srcu_struct(&fs_info->subvol_srcu); + btrfs_free_csum_hash(fs_info); btrfs_free_stripe_hash_table(fs_info); btrfs_free_ref_cache(fs_info); - - while (!list_empty(&fs_info->pinned_chunks)) { - struct extent_map *em; - - em = list_first_entry(&fs_info->pinned_chunks, - struct extent_map, list); - list_del_init(&em->list); - free_extent_map(em); - } } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, @@ -4114,7 +4103,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) * So here we should only check item pointers, not item data. */ if (btrfs_header_level(buf) == 0 && - btrfs_check_leaf_relaxed(fs_info, buf)) { + btrfs_check_leaf_relaxed(buf)) { btrfs_print_leaf(buf); ASSERT(0); } @@ -4157,10 +4146,7 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info) int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, struct btrfs_key *first_key) { - struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; - struct btrfs_fs_info *fs_info = root->fs_info; - - return btree_read_extent_buffer_pages(fs_info, buf, parent_transid, + return btree_read_extent_buffer_pages(buf, parent_transid, level, first_key); } @@ -4420,7 +4406,7 @@ again: return 0; } -static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache) +static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache) { struct inode *inode; @@ -4437,12 +4423,12 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache) void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, struct btrfs_fs_info *fs_info) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; spin_lock(&cur_trans->dirty_bgs_lock); while (!list_empty(&cur_trans->dirty_bgs)) { cache = list_first_entry(&cur_trans->dirty_bgs, - struct btrfs_block_group_cache, + struct btrfs_block_group, dirty_list); if (!list_empty(&cache->io_list)) { @@ -4470,7 +4456,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, */ while (!list_empty(&cur_trans->io_bgs)) { cache = list_first_entry(&cur_trans->io_bgs, - struct btrfs_block_group_cache, + struct btrfs_block_group, io_list); list_del_init(&cache->io_list); @@ -4484,10 +4470,17 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_fs_info *fs_info) { + struct btrfs_device *dev, *tmp; + btrfs_cleanup_dirty_bgs(cur_trans, fs_info); ASSERT(list_empty(&cur_trans->dirty_bgs)); ASSERT(list_empty(&cur_trans->io_bgs)); + list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list, + post_commit_list) { + list_del_init(&dev->post_commit_list); + } + btrfs_destroy_delayed_refs(cur_trans, fs_info); cur_trans->state = TRANS_STATE_COMMIT_START; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 987a64bc0c66..76f123ebb292 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -39,20 +39,20 @@ static inline u64 btrfs_sb_offset(int mirror) struct btrfs_device; struct btrfs_fs_devices; +int btrfs_verify_level_key(struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 parent_transid, int level, struct btrfs_key *first_key); void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr); -int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, - int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr); -void clean_tree_block(struct btrfs_fs_info *fs_info, struct extent_buffer *buf); -int open_ctree(struct super_block *sb, +void btrfs_clean_tree_block(struct extent_buffer *buf); +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); -void close_ctree(struct btrfs_fs_info *fs_info); +void __cold close_ctree(struct btrfs_fs_info *fs_info); int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, @@ -113,8 +113,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, struct btrfs_key *first_key); -u32 btrfs_csum_data(const char *data, u32 seed, size_t len); -void btrfs_csum_final(u32 crc, u8 *result); blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, @@ -123,8 +121,6 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, extent_submit_bio_start_t *submit_bio_start); blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num); -int btrfs_write_tree_block(struct extent_buffer *buf); -void btrfs_wait_tree_block_writeback(struct extent_buffer *buf); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, @@ -134,7 +130,6 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans, void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 objectid); int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index ddf28ecf17f9..72e312cae69d 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -87,7 +87,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(sb, &key, root, NULL); + inode = btrfs_iget(sb, &key, root); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto fail; @@ -214,7 +214,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - return d_obtain_alias(btrfs_iget(fs_info->sb, &key, root, NULL)); + return d_obtain_alias(btrfs_iget(fs_info->sb, &key, root)); fail: btrfs_free_path(path); return ERR_PTR(ret); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h new file mode 100644 index 000000000000..a3febe746c79 --- /dev/null +++ b/fs/btrfs/extent-io-tree.h @@ -0,0 +1,248 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_EXTENT_IO_TREE_H +#define BTRFS_EXTENT_IO_TREE_H + +struct extent_changeset; +struct io_failure_record; + +/* Bits for the extent state */ +#define EXTENT_DIRTY (1U << 0) +#define EXTENT_UPTODATE (1U << 1) +#define EXTENT_LOCKED (1U << 2) +#define EXTENT_NEW (1U << 3) +#define EXTENT_DELALLOC (1U << 4) +#define EXTENT_DEFRAG (1U << 5) +#define EXTENT_BOUNDARY (1U << 6) +#define EXTENT_NODATASUM (1U << 7) +#define EXTENT_CLEAR_META_RESV (1U << 8) +#define EXTENT_NEED_WAIT (1U << 9) +#define EXTENT_DAMAGED (1U << 10) +#define EXTENT_NORESERVE (1U << 11) +#define EXTENT_QGROUP_RESERVED (1U << 12) +#define EXTENT_CLEAR_DATA_RESV (1U << 13) +#define EXTENT_DELALLOC_NEW (1U << 14) +#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ + EXTENT_CLEAR_DATA_RESV) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING) + +/* + * Redefined bits above which are used only in the device allocation tree, + * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV + * / EXTENT_CLEAR_DATA_RESV because they have special meaning to the bit + * manipulation functions + */ +#define CHUNK_ALLOCATED EXTENT_DIRTY +#define CHUNK_TRIMMED EXTENT_DEFRAG + +enum { + IO_TREE_FS_INFO_FREED_EXTENTS0, + IO_TREE_FS_INFO_FREED_EXTENTS1, + IO_TREE_INODE_IO, + IO_TREE_INODE_IO_FAILURE, + IO_TREE_RELOC_BLOCKS, + IO_TREE_TRANS_DIRTY_PAGES, + IO_TREE_ROOT_DIRTY_LOG_PAGES, + IO_TREE_SELFTEST, +}; + +struct extent_io_tree { + struct rb_root state; + struct btrfs_fs_info *fs_info; + void *private_data; + u64 dirty_bytes; + bool track_uptodate; + + /* Who owns this io tree, should be one of IO_TREE_* */ + u8 owner; + + spinlock_t lock; + const struct extent_io_ops *ops; +}; + +struct extent_state { + u64 start; + u64 end; /* inclusive */ + struct rb_node rb_node; + + /* ADD NEW ELEMENTS AFTER THIS */ + wait_queue_head_t wq; + refcount_t refs; + unsigned state; + + struct io_failure_record *failrec; + +#ifdef CONFIG_BTRFS_DEBUG + struct list_head leak_list; +#endif +}; + +int __init extent_state_cache_init(void); +void __cold extent_state_cache_exit(void); + +void extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner, + void *private_data); +void extent_io_tree_release(struct extent_io_tree *tree); + +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached); + +static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) +{ + return lock_extent_bits(tree, start, end, NULL); +} + +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); + +int __init extent_io_init(void); +void __cold extent_io_exit(void); + +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, + u64 max_bytes, unsigned bits, int contig); + +void free_extent_state(struct extent_state *state); +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int filled, + struct extent_state *cached_state); +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, struct extent_changeset *changeset); +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached); +int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached, gfp_t mask, + struct extent_changeset *changeset); + +static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL); +} + +static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + GFP_NOFS, NULL); +} + +static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree, + u64 start, u64 end, struct extent_state **cached) +{ + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + GFP_ATOMIC, NULL); +} + +static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, + u64 end, unsigned bits) +{ + int wake = 0; + + if (bits & EXTENT_LOCKED) + wake = 1; + + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL); +} + +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, struct extent_changeset *changeset); +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask); +int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits); + +static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, + u64 end, unsigned bits) +{ + return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS); +} + +static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state) +{ + return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, + cached_state, GFP_NOFS, NULL); +} + +static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, + NULL, mask); +} + +static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, cached); +} + +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, unsigned clear_bits, + struct extent_state **cached_state); + +static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, + u64 end, unsigned int extra_bits, + struct extent_state **cached_state) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits, + NULL, cached_state, GFP_NOFS); +} + +static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, + NULL, cached_state, GFP_NOFS); +} + +static inline int set_extent_new(struct extent_io_tree *tree, u64 start, + u64 end) +{ + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, + GFP_NOFS); +} + +static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, + cached_state, mask); +} + +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, unsigned bits, + struct extent_state **cached_state); +void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, unsigned bits); +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset); +bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, + u64 *end, u64 max_bytes, + struct extent_state **cached_state); + +/* This should be reworked in the future and put elsewhere. */ +int get_state_failrec(struct extent_io_tree *tree, u64 start, + struct io_failure_record **failrec); +int set_state_failrec(struct extent_io_tree *tree, u64 start, + struct io_failure_record *failrec); +void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, + u64 end); +int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, + struct io_failure_record **failrec_ret); +int free_io_failure(struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, + struct io_failure_record *rec); +int clean_io_failure(struct btrfs_fs_info *fs_info, + struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, u64 start, + struct page *page, u64 ino, unsigned int pg_offset); + +#endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c5880329ae37..274318e9114e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -16,6 +16,7 @@ #include <linux/percpu_counter.h> #include <linux/lockdep.h> #include <linux/crc32c.h> +#include "misc.h" #include "tree-log.h" #include "disk-io.h" #include "print-tree.h" @@ -24,50 +25,16 @@ #include "locking.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "math.h" #include "sysfs.h" #include "qgroup.h" #include "ref-verify.h" +#include "space-info.h" +#include "block-rsv.h" +#include "delalloc-space.h" +#include "block-group.h" #undef SCRAMBLE_DELAYED_REFS -/* - * control flags for do_chunk_alloc's force field - * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk - * if we really need one. - * - * CHUNK_ALLOC_LIMITED means to only try and allocate one - * if we have very few chunks already allocated. This is - * used as part of the clustering code to help make sure - * we have a good pool of storage to cluster in, without - * filling the FS with empty chunks - * - * CHUNK_ALLOC_FORCE means it must try to allocate one - * - */ -enum { - CHUNK_ALLOC_NO_FORCE = 0, - CHUNK_ALLOC_LIMITED = 1, - CHUNK_ALLOC_FORCE = 2, -}; - -/* - * Declare a helper function to detect underflow of various space info members - */ -#define DECLARE_SPACE_INFO_UPDATE(name) \ -static inline void update_##name(struct btrfs_space_info *sinfo, \ - s64 bytes) \ -{ \ - if (bytes < 0 && sinfo->name < -bytes) { \ - WARN_ON(1); \ - sinfo->name = 0; \ - return; \ - } \ - sinfo->name += bytes; \ -} - -DECLARE_SPACE_INFO_UPDATE(bytes_may_use); -DECLARE_SPACE_INFO_UPDATE(bytes_pinned); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, @@ -84,148 +51,16 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op); -static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, - int force); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); -static void dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info, u64 bytes, - int dump_block_groups); -static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes); -static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes); -static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes); - -static noinline int -block_group_cache_done(struct btrfs_block_group_cache *cache) -{ - smp_mb(); - return cache->cached == BTRFS_CACHE_FINISHED || - cache->cached == BTRFS_CACHE_ERROR; -} -static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) +static int block_group_bits(struct btrfs_block_group *cache, u64 bits) { return (cache->flags & bits) == bits; } -void btrfs_get_block_group(struct btrfs_block_group_cache *cache) -{ - atomic_inc(&cache->count); -} - -void btrfs_put_block_group(struct btrfs_block_group_cache *cache) -{ - if (atomic_dec_and_test(&cache->count)) { - WARN_ON(cache->pinned > 0); - WARN_ON(cache->reserved > 0); - - /* - * If not empty, someone is still holding mutex of - * full_stripe_lock, which can only be released by caller. - * And it will definitely cause use-after-free when caller - * tries to release full stripe lock. - * - * No better way to resolve, but only to warn. - */ - WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); - kfree(cache->free_space_ctl); - kfree(cache); - } -} - -/* - * this adds the block group to the fs_info rb tree for the block group - * cache - */ -static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, - struct btrfs_block_group_cache *block_group) -{ - struct rb_node **p; - struct rb_node *parent = NULL; - struct btrfs_block_group_cache *cache; - - spin_lock(&info->block_group_cache_lock); - p = &info->block_group_cache_tree.rb_node; - - while (*p) { - parent = *p; - cache = rb_entry(parent, struct btrfs_block_group_cache, - cache_node); - if (block_group->key.objectid < cache->key.objectid) { - p = &(*p)->rb_left; - } else if (block_group->key.objectid > cache->key.objectid) { - p = &(*p)->rb_right; - } else { - spin_unlock(&info->block_group_cache_lock); - return -EEXIST; - } - } - - rb_link_node(&block_group->cache_node, parent, p); - rb_insert_color(&block_group->cache_node, - &info->block_group_cache_tree); - - if (info->first_logical_byte > block_group->key.objectid) - info->first_logical_byte = block_group->key.objectid; - - spin_unlock(&info->block_group_cache_lock); - - return 0; -} - -/* - * This will return the block group at or after bytenr if contains is 0, else - * it will return the block group that contains the bytenr - */ -static struct btrfs_block_group_cache * -block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, - int contains) -{ - struct btrfs_block_group_cache *cache, *ret = NULL; - struct rb_node *n; - u64 end, start; - - spin_lock(&info->block_group_cache_lock); - n = info->block_group_cache_tree.rb_node; - - while (n) { - cache = rb_entry(n, struct btrfs_block_group_cache, - cache_node); - end = cache->key.objectid + cache->key.offset - 1; - start = cache->key.objectid; - - if (bytenr < start) { - if (!contains && (!ret || start < ret->key.objectid)) - ret = cache; - n = n->rb_left; - } else if (bytenr > start) { - if (contains && bytenr <= end) { - ret = cache; - break; - } - n = n->rb_right; - } else { - ret = cache; - break; - } - } - if (ret) { - btrfs_get_block_group(ret); - if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) - info->first_logical_byte = ret->key.objectid; - } - spin_unlock(&info->block_group_cache_lock); - - return ret; -} - -static int add_excluded_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 num_bytes) +int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 num_bytes) { u64 end = start + num_bytes - 1; set_extent_bits(&fs_info->freed_extents[0], @@ -235,13 +70,13 @@ static int add_excluded_extent(struct btrfs_fs_info *fs_info, return 0; } -static void free_excluded_extents(struct btrfs_block_group_cache *cache) +void btrfs_free_excluded_extents(struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; u64 start, end; - start = cache->key.objectid; - end = start + cache->key.offset - 1; + start = cache->start; + end = start + cache->length - 1; clear_extent_bits(&fs_info->freed_extents[0], start, end, EXTENT_UPTODATE); @@ -249,547 +84,39 @@ static void free_excluded_extents(struct btrfs_block_group_cache *cache) start, end, EXTENT_UPTODATE); } -static int exclude_super_stripes(struct btrfs_block_group_cache *cache) +static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) { - struct btrfs_fs_info *fs_info = cache->fs_info; - u64 bytenr; - u64 *logical; - int stripe_len; - int i, nr, ret; - - if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { - stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; - cache->bytes_super += stripe_len; - ret = add_excluded_extent(fs_info, cache->key.objectid, - stripe_len); - if (ret) - return ret; - } - - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(fs_info, cache->key.objectid, - bytenr, &logical, &nr, &stripe_len); - if (ret) - return ret; - - while (nr--) { - u64 start, len; - - if (logical[nr] > cache->key.objectid + - cache->key.offset) - continue; - - if (logical[nr] + stripe_len <= cache->key.objectid) - continue; - - start = logical[nr]; - if (start < cache->key.objectid) { - start = cache->key.objectid; - len = (logical[nr] + stripe_len) - start; - } else { - len = min_t(u64, stripe_len, - cache->key.objectid + - cache->key.offset - start); - } - - cache->bytes_super += len; - ret = add_excluded_extent(fs_info, start, len); - if (ret) { - kfree(logical); - return ret; - } - } - - kfree(logical); - } - return 0; -} - -static struct btrfs_caching_control * -get_caching_control(struct btrfs_block_group_cache *cache) -{ - struct btrfs_caching_control *ctl; - - spin_lock(&cache->lock); - if (!cache->caching_ctl) { - spin_unlock(&cache->lock); - return NULL; - } - - ctl = cache->caching_ctl; - refcount_inc(&ctl->count); - spin_unlock(&cache->lock); - return ctl; -} - -static void put_caching_control(struct btrfs_caching_control *ctl) -{ - if (refcount_dec_and_test(&ctl->count)) - kfree(ctl); -} - -#ifdef CONFIG_BTRFS_DEBUG -static void fragment_free_space(struct btrfs_block_group_cache *block_group) -{ - struct btrfs_fs_info *fs_info = block_group->fs_info; - u64 start = block_group->key.objectid; - u64 len = block_group->key.offset; - u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? - fs_info->nodesize : fs_info->sectorsize; - u64 step = chunk << 1; - - while (len > chunk) { - btrfs_remove_free_space(block_group, start, chunk); - start += step; - if (len < step) - len = 0; + if (ref->type == BTRFS_REF_METADATA) { + if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) + return BTRFS_BLOCK_GROUP_SYSTEM; else - len -= step; - } -} -#endif - -/* - * this is only called by cache_block_group, since we could have freed extents - * we need to check the pinned_extents for any extents that can't be used yet - * since their free space will be released as soon as the transaction commits. - */ -u64 add_new_free_space(struct btrfs_block_group_cache *block_group, - u64 start, u64 end) -{ - struct btrfs_fs_info *info = block_group->fs_info; - u64 extent_start, extent_end, size, total_added = 0; - int ret; - - while (start < end) { - ret = find_first_extent_bit(info->pinned_extents, start, - &extent_start, &extent_end, - EXTENT_DIRTY | EXTENT_UPTODATE, - NULL); - if (ret) - break; - - if (extent_start <= start) { - start = extent_end + 1; - } else if (extent_start > start && extent_start < end) { - size = extent_start - start; - total_added += size; - ret = btrfs_add_free_space(block_group, start, - size); - BUG_ON(ret); /* -ENOMEM or logic error */ - start = extent_end + 1; - } else { - break; - } - } - - if (start < end) { - size = end - start; - total_added += size; - ret = btrfs_add_free_space(block_group, start, size); - BUG_ON(ret); /* -ENOMEM or logic error */ - } - - return total_added; -} - -static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) -{ - struct btrfs_block_group_cache *block_group = caching_ctl->block_group; - struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_root *extent_root = fs_info->extent_root; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_key key; - u64 total_found = 0; - u64 last = 0; - u32 nritems; - int ret; - bool wakeup = true; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); - -#ifdef CONFIG_BTRFS_DEBUG - /* - * If we're fragmenting we don't want to make anybody think we can - * allocate from this block group until we've had a chance to fragment - * the free space. - */ - if (btrfs_should_fragment_free_space(block_group)) - wakeup = false; -#endif - /* - * We don't want to deadlock with somebody trying to allocate a new - * extent for the extent root while also trying to search the extent - * root to add free space. So we skip locking and search the commit - * root, since its read-only - */ - path->skip_locking = 1; - path->search_commit_root = 1; - path->reada = READA_FORWARD; - - key.objectid = last; - key.offset = 0; - key.type = BTRFS_EXTENT_ITEM_KEY; - -next: - ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); - if (ret < 0) - goto out; - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - - while (1) { - if (btrfs_fs_closing(fs_info) > 1) { - last = (u64)-1; - break; - } - - if (path->slots[0] < nritems) { - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - } else { - ret = find_next_key(path, 0, &key); - if (ret) - break; - - if (need_resched() || - rwsem_is_contended(&fs_info->commit_root_sem)) { - if (wakeup) - caching_ctl->progress = last; - btrfs_release_path(path); - up_read(&fs_info->commit_root_sem); - mutex_unlock(&caching_ctl->mutex); - cond_resched(); - mutex_lock(&caching_ctl->mutex); - down_read(&fs_info->commit_root_sem); - goto next; - } - - ret = btrfs_next_leaf(extent_root, path); - if (ret < 0) - goto out; - if (ret) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - continue; - } - - if (key.objectid < last) { - key.objectid = last; - key.offset = 0; - key.type = BTRFS_EXTENT_ITEM_KEY; - - if (wakeup) - caching_ctl->progress = last; - btrfs_release_path(path); - goto next; - } - - if (key.objectid < block_group->key.objectid) { - path->slots[0]++; - continue; - } - - if (key.objectid >= block_group->key.objectid + - block_group->key.offset) - break; - - if (key.type == BTRFS_EXTENT_ITEM_KEY || - key.type == BTRFS_METADATA_ITEM_KEY) { - total_found += add_new_free_space(block_group, last, - key.objectid); - if (key.type == BTRFS_METADATA_ITEM_KEY) - last = key.objectid + - fs_info->nodesize; - else - last = key.objectid + key.offset; - - if (total_found > CACHING_CTL_WAKE_UP) { - total_found = 0; - if (wakeup) - wake_up(&caching_ctl->wait); - } - } - path->slots[0]++; - } - ret = 0; - - total_found += add_new_free_space(block_group, last, - block_group->key.objectid + - block_group->key.offset); - caching_ctl->progress = (u64)-1; - -out: - btrfs_free_path(path); - return ret; -} - -static noinline void caching_thread(struct btrfs_work *work) -{ - struct btrfs_block_group_cache *block_group; - struct btrfs_fs_info *fs_info; - struct btrfs_caching_control *caching_ctl; - int ret; - - caching_ctl = container_of(work, struct btrfs_caching_control, work); - block_group = caching_ctl->block_group; - fs_info = block_group->fs_info; - - mutex_lock(&caching_ctl->mutex); - down_read(&fs_info->commit_root_sem); - - if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) - ret = load_free_space_tree(caching_ctl); - else - ret = load_extent_tree_free(caching_ctl); - - spin_lock(&block_group->lock); - block_group->caching_ctl = NULL; - block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED; - spin_unlock(&block_group->lock); - -#ifdef CONFIG_BTRFS_DEBUG - if (btrfs_should_fragment_free_space(block_group)) { - u64 bytes_used; - - spin_lock(&block_group->space_info->lock); - spin_lock(&block_group->lock); - bytes_used = block_group->key.offset - - btrfs_block_group_used(&block_group->item); - block_group->space_info->bytes_used += bytes_used >> 1; - spin_unlock(&block_group->lock); - spin_unlock(&block_group->space_info->lock); - fragment_free_space(block_group); + return BTRFS_BLOCK_GROUP_METADATA; } -#endif - - caching_ctl->progress = (u64)-1; - - up_read(&fs_info->commit_root_sem); - free_excluded_extents(block_group); - mutex_unlock(&caching_ctl->mutex); - - wake_up(&caching_ctl->wait); - - put_caching_control(caching_ctl); - btrfs_put_block_group(block_group); + return BTRFS_BLOCK_GROUP_DATA; } -static int cache_block_group(struct btrfs_block_group_cache *cache, - int load_cache_only) -{ - DEFINE_WAIT(wait); - struct btrfs_fs_info *fs_info = cache->fs_info; - struct btrfs_caching_control *caching_ctl; - int ret = 0; - - caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); - if (!caching_ctl) - return -ENOMEM; - - INIT_LIST_HEAD(&caching_ctl->list); - mutex_init(&caching_ctl->mutex); - init_waitqueue_head(&caching_ctl->wait); - caching_ctl->block_group = cache; - caching_ctl->progress = cache->key.objectid; - refcount_set(&caching_ctl->count, 1); - btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, - caching_thread, NULL, NULL); - - spin_lock(&cache->lock); - /* - * This should be a rare occasion, but this could happen I think in the - * case where one thread starts to load the space cache info, and then - * some other thread starts a transaction commit which tries to do an - * allocation while the other thread is still loading the space cache - * info. The previous loop should have kept us from choosing this block - * group, but if we've moved to the state where we will wait on caching - * block groups we need to first check if we're doing a fast load here, - * so we can wait for it to finish, otherwise we could end up allocating - * from a block group who's cache gets evicted for one reason or - * another. - */ - while (cache->cached == BTRFS_CACHE_FAST) { - struct btrfs_caching_control *ctl; - - ctl = cache->caching_ctl; - refcount_inc(&ctl->count); - prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&cache->lock); - - schedule(); - - finish_wait(&ctl->wait, &wait); - put_caching_control(ctl); - spin_lock(&cache->lock); - } - - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - kfree(caching_ctl); - return 0; - } - WARN_ON(cache->caching_ctl); - cache->caching_ctl = caching_ctl; - cache->cached = BTRFS_CACHE_FAST; - spin_unlock(&cache->lock); - - if (btrfs_test_opt(fs_info, SPACE_CACHE)) { - mutex_lock(&caching_ctl->mutex); - ret = load_free_space_cache(fs_info, cache); - - spin_lock(&cache->lock); - if (ret == 1) { - cache->caching_ctl = NULL; - cache->cached = BTRFS_CACHE_FINISHED; - cache->last_byte_to_unpin = (u64)-1; - caching_ctl->progress = (u64)-1; - } else { - if (load_cache_only) { - cache->caching_ctl = NULL; - cache->cached = BTRFS_CACHE_NO; - } else { - cache->cached = BTRFS_CACHE_STARTED; - cache->has_caching_ctl = 1; - } - } - spin_unlock(&cache->lock); -#ifdef CONFIG_BTRFS_DEBUG - if (ret == 1 && - btrfs_should_fragment_free_space(cache)) { - u64 bytes_used; - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - bytes_used = cache->key.offset - - btrfs_block_group_used(&cache->item); - cache->space_info->bytes_used += bytes_used >> 1; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - fragment_free_space(cache); - } -#endif - mutex_unlock(&caching_ctl->mutex); - - wake_up(&caching_ctl->wait); - if (ret == 1) { - put_caching_control(caching_ctl); - free_excluded_extents(cache); - return 0; - } - } else { - /* - * We're either using the free space tree or no caching at all. - * Set cached to the appropriate value and wakeup any waiters. - */ - spin_lock(&cache->lock); - if (load_cache_only) { - cache->caching_ctl = NULL; - cache->cached = BTRFS_CACHE_NO; - } else { - cache->cached = BTRFS_CACHE_STARTED; - cache->has_caching_ctl = 1; - } - spin_unlock(&cache->lock); - wake_up(&caching_ctl->wait); - } - - if (load_cache_only) { - put_caching_control(caching_ctl); - return 0; - } - - down_write(&fs_info->commit_root_sem); - refcount_inc(&caching_ctl->count); - list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); - up_write(&fs_info->commit_root_sem); - - btrfs_get_block_group(cache); - - btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); - - return ret; -} - -/* - * return the block group that starts at or after bytenr - */ -static struct btrfs_block_group_cache * -btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) -{ - return block_group_cache_tree_search(info, bytenr, 0); -} - -/* - * return the block group that contains the given bytenr - */ -struct btrfs_block_group_cache *btrfs_lookup_block_group( - struct btrfs_fs_info *info, - u64 bytenr) -{ - return block_group_cache_tree_search(info, bytenr, 1); -} - -static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, - u64 flags) -{ - struct list_head *head = &info->space_info; - struct btrfs_space_info *found; - - flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; - - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & flags) { - rcu_read_unlock(); - return found; - } - } - rcu_read_unlock(); - return NULL; -} - -static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes, - bool metadata, u64 root_objectid) +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_ref *ref) { struct btrfs_space_info *space_info; - u64 flags; - - if (metadata) { - if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - } else { - flags = BTRFS_BLOCK_GROUP_DATA; - } + u64 flags = generic_ref_to_space_flags(ref); - space_info = __find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, flags); ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes, + percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len, BTRFS_TOTAL_BYTES_PINNED_BATCH); } -/* - * after adding space to the filesystem, we need to clear the full flags - * on all the space infos. - */ -void btrfs_clear_space_info_full(struct btrfs_fs_info *info) +static void sub_pinned_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_ref *ref) { - struct list_head *head = &info->space_info; - struct btrfs_space_info *found; + struct btrfs_space_info *space_info; + u64 flags = generic_ref_to_space_flags(ref); - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) - found->full = 0; - rcu_read_unlock(); + space_info = btrfs_find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len, + BTRFS_TOTAL_BYTES_PINNED_BATCH); } /* simple helper to search for an existing data extent at a given offset */ @@ -1111,18 +438,18 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, return BTRFS_REF_TYPE_INVALID; } -static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) +u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) { u32 high_crc = ~(u32)0; u32 low_crc = ~(u32)0; __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } @@ -1704,7 +1031,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, type = extent_ref_type(parent, owner); size = btrfs_extent_inline_ref_size(type); - btrfs_extend_item(fs_info, path, size); + btrfs_extend_item(path, size); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); @@ -1779,7 +1106,6 @@ void update_inline_extent_backref(struct btrfs_path *path, int *last_ref) { struct extent_buffer *leaf = path->nodes[0]; - struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_extent_item *ei; struct btrfs_extent_data_ref *dref = NULL; struct btrfs_shared_data_ref *sref = NULL; @@ -1834,7 +1160,7 @@ void update_inline_extent_backref(struct btrfs_path *path, memmove_extent_buffer(leaf, ptr, ptr + size, end - ptr - size); item_size -= size; - btrfs_truncate_item(fs_info, path, item_size, 1); + btrfs_truncate_item(path, item_size, 1); } btrfs_mark_buffer_dirty(leaf); } @@ -1905,7 +1231,6 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, u64 *discarded_bytes) { @@ -1981,8 +1306,10 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes) { - int ret; + int ret = 0; u64 discarded_bytes = 0; + u64 end = bytenr + num_bytes; + u64 cur = bytenr; struct btrfs_bio *bbio = NULL; @@ -1991,15 +1318,23 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, * associated to its stripes that don't go away while we are discarding. */ btrfs_bio_counter_inc_blocked(fs_info); - /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes, - &bbio, 0); - /* Error condition is -ENOMEM */ - if (!ret) { - struct btrfs_bio_stripe *stripe = bbio->stripes; + while (cur < end) { + struct btrfs_bio_stripe *stripe; int i; + num_bytes = end - cur; + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur, + &num_bytes, &bbio, 0); + /* + * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or + * -EOPNOTSUPP. For any such error, @num_bytes is not updated, + * thus we can't continue anyway. + */ + if (ret < 0) + goto out; + stripe = bbio->stripes; for (i = 0; i < bbio->num_stripes; i++, stripe++) { u64 bytes; struct request_queue *req_q; @@ -2016,10 +1351,19 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, stripe->physical, stripe->length, &bytes); - if (!ret) + if (!ret) { discarded_bytes += bytes; - else if (ret != -EOPNOTSUPP) - break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ + } else if (ret != -EOPNOTSUPP) { + /* + * Logic errors or -ENOMEM, or -EIO, but + * unlikely to happen. + * + * And since there are two loops, explicitly + * go to out to avoid confusion. + */ + btrfs_put_bbio(bbio); + goto out; + } /* * Just in case we get back EOPNOTSUPP for some reason, @@ -2029,7 +1373,9 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, ret = 0; } btrfs_put_bbio(bbio); + cur += num_bytes; } +out: btrfs_bio_counter_dec(fs_info); if (actual_bytes) @@ -2043,39 +1389,28 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, /* Can return -ENOMEM */ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) + struct btrfs_ref *generic_ref) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; int old_ref_mod, new_ref_mod; int ret; - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && - root_objectid == BTRFS_TREE_LOG_OBJECTID); - - btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid, - owner, offset, BTRFS_ADD_DELAYED_REF); + ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && + generic_ref->action); + BUG_ON(generic_ref->type == BTRFS_REF_METADATA && + generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID); - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, - num_bytes, parent, - root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, - &old_ref_mod, &new_ref_mod); - } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, - num_bytes, parent, - root_objectid, owner, offset, - 0, BTRFS_ADD_DELAYED_REF, + if (generic_ref->type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, generic_ref, + NULL, &old_ref_mod, &new_ref_mod); + else + ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0, &old_ref_mod, &new_ref_mod); - } - if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) { - bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + btrfs_ref_tree_mod(fs_info, generic_ref); - add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid); - } + if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) + sub_pinned_bytes(fs_info, generic_ref); return ret; } @@ -2472,7 +1807,7 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, flags = BTRFS_BLOCK_GROUP_SYSTEM; else flags = BTRFS_BLOCK_GROUP_METADATA; - space_info = __find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, flags); ASSERT(space_info); percpu_counter_add_batch(&space_info->total_bytes_pinned, -head->num_bytes, @@ -2534,8 +1869,8 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, btrfs_pin_extent(fs_info, head->bytenr, head->num_bytes, 1); if (head->is_data) { - ret = btrfs_del_csums(trans, fs_info, head->bytenr, - head->num_bytes); + ret = btrfs_del_csums(trans, fs_info->csum_root, + head->bytenr, head->num_bytes); } } @@ -2834,140 +2169,6 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) return num_csums; } -bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) -{ - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - bool ret = false; - u64 reserved; - - spin_lock(&global_rsv->lock); - reserved = global_rsv->reserved; - spin_unlock(&global_rsv->lock); - - /* - * Since the global reserve is just kind of magic we don't really want - * to rely on it to save our bacon, so if our size is more than the - * delayed_refs_rsv and the global rsv then it's time to think about - * bailing. - */ - spin_lock(&delayed_refs_rsv->lock); - reserved += delayed_refs_rsv->reserved; - if (delayed_refs_rsv->size >= reserved) - ret = true; - spin_unlock(&delayed_refs_rsv->lock); - return ret; -} - -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) -{ - u64 num_entries = - atomic_read(&trans->transaction->delayed_refs.num_entries); - u64 avg_runtime; - u64 val; - - smp_mb(); - avg_runtime = trans->fs_info->avg_delayed_ref_runtime; - val = num_entries * avg_runtime; - if (val >= NSEC_PER_SEC) - return 1; - if (val >= NSEC_PER_SEC / 2) - return 2; - - return btrfs_check_space_for_delayed_refs(trans->fs_info); -} - -struct async_delayed_refs { - struct btrfs_root *root; - u64 transid; - int count; - int error; - int sync; - struct completion wait; - struct btrfs_work work; -}; - -static inline struct async_delayed_refs * -to_async_delayed_refs(struct btrfs_work *work) -{ - return container_of(work, struct async_delayed_refs, work); -} - -static void delayed_ref_async_start(struct btrfs_work *work) -{ - struct async_delayed_refs *async = to_async_delayed_refs(work); - struct btrfs_trans_handle *trans; - struct btrfs_fs_info *fs_info = async->root->fs_info; - int ret; - - /* if the commit is already started, we don't need to wait here */ - if (btrfs_transaction_blocked(fs_info)) - goto done; - - trans = btrfs_join_transaction(async->root); - if (IS_ERR(trans)) { - async->error = PTR_ERR(trans); - goto done; - } - - /* - * trans->sync means that when we call end_transaction, we won't - * wait on delayed refs - */ - trans->sync = true; - - /* Don't bother flushing if we got into a different transaction */ - if (trans->transid > async->transid) - goto end; - - ret = btrfs_run_delayed_refs(trans, async->count); - if (ret) - async->error = ret; -end: - ret = btrfs_end_transaction(trans); - if (ret && !async->error) - async->error = ret; -done: - if (async->sync) - complete(&async->wait); - else - kfree(async); -} - -int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, - unsigned long count, u64 transid, int wait) -{ - struct async_delayed_refs *async; - int ret; - - async = kmalloc(sizeof(*async), GFP_NOFS); - if (!async) - return -ENOMEM; - - async->root = fs_info->tree_root; - async->count = count; - async->error = 0; - async->transid = transid; - if (wait) - async->sync = 1; - else - async->sync = 0; - init_completion(&async->wait); - - btrfs_init_work(&async->work, btrfs_extent_refs_helper, - delayed_ref_async_start, NULL, NULL); - - btrfs_queue_work(fs_info->extent_workers, &async->work); - - if (wait) { - wait_for_completion(&async->wait); - ret = async->error; - kfree(async); - return ret; - } - return 0; -} - /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -3036,7 +2237,6 @@ out: } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 flags, int level, int is_data) { @@ -3053,8 +2253,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->is_data = is_data ? true : false; extent_op->level = level; - ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr, - num_bytes, extent_op); + ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); if (ret) btrfs_free_delayed_extent_op(extent_op); return ret; @@ -3179,16 +2378,19 @@ static noinline int check_committed_ref(struct btrfs_root *root, item_size = btrfs_item_size_nr(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + /* If extent item has more than 1 inline ref then it's shared */ if (item_size != sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) goto out; + /* If extent created before last snapshot => it's definitely shared */ if (btrfs_extent_generation(leaf, ei) <= btrfs_root_last_snapshot(&root->root_item)) goto out; iref = (struct btrfs_extent_inline_ref *)(ei + 1); + /* If this extent has SHARED_DATA_REF then it's shared */ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); if (type != BTRFS_EXTENT_DATA_REF_KEY) goto out; @@ -3246,13 +2448,12 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, u32 nritems; struct btrfs_key key; struct btrfs_file_extent_item *fi; + struct btrfs_ref generic_ref = { 0 }; + bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); int i; + int action; int level; int ret = 0; - int (*process_func)(struct btrfs_trans_handle *, - struct btrfs_root *, - u64, u64, u64, u64, u64, u64); - if (btrfs_is_testing(fs_info)) return 0; @@ -3264,15 +2465,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) return 0; - if (inc) - process_func = btrfs_inc_extent_ref; - else - process_func = btrfs_free_extent; - if (full_backref) parent = buf->start; else parent = 0; + if (inc) + action = BTRFS_ADD_DELAYED_REF; + else + action = BTRFS_DROP_DELAYED_REF; for (i = 0; i < nritems; i++) { if (level == 0) { @@ -3290,16 +2490,30 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); key.offset -= btrfs_file_extent_offset(buf, fi); - ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, key.objectid, - key.offset); + btrfs_init_generic_ref(&generic_ref, action, bytenr, + num_bytes, parent); + generic_ref.real_root = root->root_key.objectid; + btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, + key.offset); + generic_ref.skip_qgroup = for_reloc; + if (inc) + ret = btrfs_inc_extent_ref(trans, &generic_ref); + else + ret = btrfs_free_extent(trans, &generic_ref); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = fs_info->nodesize; - ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0); + btrfs_init_generic_ref(&generic_ref, action, bytenr, + num_bytes, parent); + generic_ref.real_root = root->root_key.objectid; + btrfs_init_tree_ref(&generic_ref, level - 1, ref_root); + generic_ref.skip_qgroup = for_reloc; + if (inc) + ret = btrfs_inc_extent_ref(trans, &generic_ref); + else + ret = btrfs_free_extent(trans, &generic_ref); if (ret) goto fail; } @@ -3321,561 +2535,9 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } -static int write_one_cache_group(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - struct btrfs_block_group_cache *cache) -{ - int ret; - struct btrfs_root *extent_root = fs_info->extent_root; - unsigned long bi; - struct extent_buffer *leaf; - - ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); - if (ret) { - if (ret > 0) - ret = -ENOENT; - goto fail; - } - - leaf = path->nodes[0]; - bi = btrfs_item_ptr_offset(leaf, path->slots[0]); - write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); - btrfs_mark_buffer_dirty(leaf); -fail: - btrfs_release_path(path); - return ret; - -} - -static struct btrfs_block_group_cache * -next_block_group(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache) -{ - struct rb_node *node; - - spin_lock(&fs_info->block_group_cache_lock); - - /* If our block group was removed, we need a full search. */ - if (RB_EMPTY_NODE(&cache->cache_node)) { - const u64 next_bytenr = cache->key.objectid + cache->key.offset; - - spin_unlock(&fs_info->block_group_cache_lock); - btrfs_put_block_group(cache); - cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; - } - node = rb_next(&cache->cache_node); - btrfs_put_block_group(cache); - if (node) { - cache = rb_entry(node, struct btrfs_block_group_cache, - cache_node); - btrfs_get_block_group(cache); - } else - cache = NULL; - spin_unlock(&fs_info->block_group_cache_lock); - return cache; -} - -static int cache_save_setup(struct btrfs_block_group_cache *block_group, - struct btrfs_trans_handle *trans, - struct btrfs_path *path) -{ - struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_root *root = fs_info->tree_root; - struct inode *inode = NULL; - struct extent_changeset *data_reserved = NULL; - u64 alloc_hint = 0; - int dcs = BTRFS_DC_ERROR; - u64 num_pages = 0; - int retries = 0; - int ret = 0; - - /* - * If this block group is smaller than 100 megs don't bother caching the - * block group. - */ - if (block_group->key.offset < (100 * SZ_1M)) { - spin_lock(&block_group->lock); - block_group->disk_cache_state = BTRFS_DC_WRITTEN; - spin_unlock(&block_group->lock); - return 0; - } - - if (trans->aborted) - return 0; -again: - inode = lookup_free_space_inode(fs_info, block_group, path); - if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { - ret = PTR_ERR(inode); - btrfs_release_path(path); - goto out; - } - - if (IS_ERR(inode)) { - BUG_ON(retries); - retries++; - - if (block_group->ro) - goto out_free; - - ret = create_free_space_inode(fs_info, trans, block_group, - path); - if (ret) - goto out_free; - goto again; - } - - /* - * We want to set the generation to 0, that way if anything goes wrong - * from here on out we know not to trust this cache when we load up next - * time. - */ - BTRFS_I(inode)->generation = 0; - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - /* - * So theoretically we could recover from this, simply set the - * super cache generation to 0 so we know to invalidate the - * cache, but then we'd have to keep track of the block groups - * that fail this way so we know we _have_ to reset this cache - * before the next commit or risk reading stale cache. So to - * limit our exposure to horrible edge cases lets just abort the - * transaction, this only happens in really bad situations - * anyway. - */ - btrfs_abort_transaction(trans, ret); - goto out_put; - } - WARN_ON(ret); - - /* We've already setup this transaction, go ahead and exit */ - if (block_group->cache_generation == trans->transid && - i_size_read(inode)) { - dcs = BTRFS_DC_SETUP; - goto out_put; - } - - if (i_size_read(inode) > 0) { - ret = btrfs_check_trunc_cache_free_space(fs_info, - &fs_info->global_block_rsv); - if (ret) - goto out_put; - - ret = btrfs_truncate_free_space_cache(trans, NULL, inode); - if (ret) - goto out_put; - } - - spin_lock(&block_group->lock); - if (block_group->cached != BTRFS_CACHE_FINISHED || - !btrfs_test_opt(fs_info, SPACE_CACHE)) { - /* - * don't bother trying to write stuff out _if_ - * a) we're not cached, - * b) we're with nospace_cache mount option, - * c) we're with v2 space_cache (FREE_SPACE_TREE). - */ - dcs = BTRFS_DC_WRITTEN; - spin_unlock(&block_group->lock); - goto out_put; - } - spin_unlock(&block_group->lock); - - /* - * We hit an ENOSPC when setting up the cache in this transaction, just - * skip doing the setup, we've already cleared the cache so we're safe. - */ - if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { - ret = -ENOSPC; - goto out_put; - } - - /* - * Try to preallocate enough space based on how big the block group is. - * Keep in mind this has to include any pinned space which could end up - * taking up quite a bit since it's not folded into the other space - * cache. - */ - num_pages = div_u64(block_group->key.offset, SZ_256M); - if (!num_pages) - num_pages = 1; - - num_pages *= 16; - num_pages *= PAGE_SIZE; - - ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); - if (ret) - goto out_put; - - ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, - num_pages, num_pages, - &alloc_hint); - /* - * Our cache requires contiguous chunks so that we don't modify a bunch - * of metadata or split extents when writing the cache out, which means - * we can enospc if we are heavily fragmented in addition to just normal - * out of space conditions. So if we hit this just skip setting up any - * other block groups for this transaction, maybe we'll unpin enough - * space the next time around. - */ - if (!ret) - dcs = BTRFS_DC_SETUP; - else if (ret == -ENOSPC) - set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); - -out_put: - iput(inode); -out_free: - btrfs_release_path(path); -out: - spin_lock(&block_group->lock); - if (!ret && dcs == BTRFS_DC_SETUP) - block_group->cache_generation = trans->transid; - block_group->disk_cache_state = dcs; - spin_unlock(&block_group->lock); - - extent_changeset_free(data_reserved); - return ret; -} - -int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_block_group_cache *cache, *tmp; - struct btrfs_transaction *cur_trans = trans->transaction; - struct btrfs_path *path; - - if (list_empty(&cur_trans->dirty_bgs) || - !btrfs_test_opt(fs_info, SPACE_CACHE)) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - /* Could add new block groups, use _safe just in case */ - list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, - dirty_list) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) - cache_save_setup(cache, trans, path); - } - - btrfs_free_path(path); - return 0; -} - -/* - * transaction commit does final block group cache writeback during a - * critical section where nothing is allowed to change the FS. This is - * required in order for the cache to actually match the block group, - * but can introduce a lot of latency into the commit. - * - * So, btrfs_start_dirty_block_groups is here to kick off block group - * cache IO. There's a chance we'll have to redo some of it if the - * block group changes again during the commit, but it greatly reduces - * the commit latency by getting rid of the easy block groups while - * we're still allowing others to join the commit. - */ -int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *cache; - struct btrfs_transaction *cur_trans = trans->transaction; - int ret = 0; - int should_put; - struct btrfs_path *path = NULL; - LIST_HEAD(dirty); - struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; - int loops = 0; - - spin_lock(&cur_trans->dirty_bgs_lock); - if (list_empty(&cur_trans->dirty_bgs)) { - spin_unlock(&cur_trans->dirty_bgs_lock); - return 0; - } - list_splice_init(&cur_trans->dirty_bgs, &dirty); - spin_unlock(&cur_trans->dirty_bgs_lock); - -again: - /* - * make sure all the block groups on our dirty list actually - * exist - */ - btrfs_create_pending_block_groups(trans); - - if (!path) { - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - } - - /* - * cache_write_mutex is here only to save us from balance or automatic - * removal of empty block groups deleting this block group while we are - * writing out the cache - */ - mutex_lock(&trans->transaction->cache_write_mutex); - while (!list_empty(&dirty)) { - bool drop_reserve = true; - - cache = list_first_entry(&dirty, - struct btrfs_block_group_cache, - dirty_list); - /* - * this can happen if something re-dirties a block - * group that is already under IO. Just wait for it to - * finish and then do it all again - */ - if (!list_empty(&cache->io_list)) { - list_del_init(&cache->io_list); - btrfs_wait_cache_io(trans, cache, path); - btrfs_put_block_group(cache); - } - - - /* - * btrfs_wait_cache_io uses the cache->dirty_list to decide - * if it should update the cache_state. Don't delete - * until after we wait. - * - * Since we're not running in the commit critical section - * we need the dirty_bgs_lock to protect from update_block_group - */ - spin_lock(&cur_trans->dirty_bgs_lock); - list_del_init(&cache->dirty_list); - spin_unlock(&cur_trans->dirty_bgs_lock); - - should_put = 1; - - cache_save_setup(cache, trans, path); - - if (cache->disk_cache_state == BTRFS_DC_SETUP) { - cache->io_ctl.inode = NULL; - ret = btrfs_write_out_cache(fs_info, trans, - cache, path); - if (ret == 0 && cache->io_ctl.inode) { - num_started++; - should_put = 0; - - /* - * The cache_write_mutex is protecting the - * io_list, also refer to the definition of - * btrfs_transaction::io_bgs for more details - */ - list_add_tail(&cache->io_list, io); - } else { - /* - * if we failed to write the cache, the - * generation will be bad and life goes on - */ - ret = 0; - } - } - if (!ret) { - ret = write_one_cache_group(trans, fs_info, - path, cache); - /* - * Our block group might still be attached to the list - * of new block groups in the transaction handle of some - * other task (struct btrfs_trans_handle->new_bgs). This - * means its block group item isn't yet in the extent - * tree. If this happens ignore the error, as we will - * try again later in the critical section of the - * transaction commit. - */ - if (ret == -ENOENT) { - ret = 0; - spin_lock(&cur_trans->dirty_bgs_lock); - if (list_empty(&cache->dirty_list)) { - list_add_tail(&cache->dirty_list, - &cur_trans->dirty_bgs); - btrfs_get_block_group(cache); - drop_reserve = false; - } - spin_unlock(&cur_trans->dirty_bgs_lock); - } else if (ret) { - btrfs_abort_transaction(trans, ret); - } - } - - /* if it's not on the io list, we need to put the block group */ - if (should_put) - btrfs_put_block_group(cache); - if (drop_reserve) - btrfs_delayed_refs_rsv_release(fs_info, 1); - - if (ret) - break; - - /* - * Avoid blocking other tasks for too long. It might even save - * us from writing caches for block groups that are going to be - * removed. - */ - mutex_unlock(&trans->transaction->cache_write_mutex); - mutex_lock(&trans->transaction->cache_write_mutex); - } - mutex_unlock(&trans->transaction->cache_write_mutex); - - /* - * go through delayed refs for all the stuff we've just kicked off - * and then loop back (just once) - */ - ret = btrfs_run_delayed_refs(trans, 0); - if (!ret && loops == 0) { - loops++; - spin_lock(&cur_trans->dirty_bgs_lock); - list_splice_init(&cur_trans->dirty_bgs, &dirty); - /* - * dirty_bgs_lock protects us from concurrent block group - * deletes too (not just cache_write_mutex). - */ - if (!list_empty(&dirty)) { - spin_unlock(&cur_trans->dirty_bgs_lock); - goto again; - } - spin_unlock(&cur_trans->dirty_bgs_lock); - } else if (ret < 0) { - btrfs_cleanup_dirty_bgs(cur_trans, fs_info); - } - - btrfs_free_path(path); - return ret; -} - -int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_block_group_cache *cache; - struct btrfs_transaction *cur_trans = trans->transaction; - int ret = 0; - int should_put; - struct btrfs_path *path; - struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - /* - * Even though we are in the critical section of the transaction commit, - * we can still have concurrent tasks adding elements to this - * transaction's list of dirty block groups. These tasks correspond to - * endio free space workers started when writeback finishes for a - * space cache, which run inode.c:btrfs_finish_ordered_io(), and can - * allocate new block groups as a result of COWing nodes of the root - * tree when updating the free space inode. The writeback for the space - * caches is triggered by an earlier call to - * btrfs_start_dirty_block_groups() and iterations of the following - * loop. - * Also we want to do the cache_save_setup first and then run the - * delayed refs to make sure we have the best chance at doing this all - * in one shot. - */ - spin_lock(&cur_trans->dirty_bgs_lock); - while (!list_empty(&cur_trans->dirty_bgs)) { - cache = list_first_entry(&cur_trans->dirty_bgs, - struct btrfs_block_group_cache, - dirty_list); - - /* - * this can happen if cache_save_setup re-dirties a block - * group that is already under IO. Just wait for it to - * finish and then do it all again - */ - if (!list_empty(&cache->io_list)) { - spin_unlock(&cur_trans->dirty_bgs_lock); - list_del_init(&cache->io_list); - btrfs_wait_cache_io(trans, cache, path); - btrfs_put_block_group(cache); - spin_lock(&cur_trans->dirty_bgs_lock); - } - - /* - * don't remove from the dirty list until after we've waited - * on any pending IO - */ - list_del_init(&cache->dirty_list); - spin_unlock(&cur_trans->dirty_bgs_lock); - should_put = 1; - - cache_save_setup(cache, trans, path); - - if (!ret) - ret = btrfs_run_delayed_refs(trans, - (unsigned long) -1); - - if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { - cache->io_ctl.inode = NULL; - ret = btrfs_write_out_cache(fs_info, trans, - cache, path); - if (ret == 0 && cache->io_ctl.inode) { - num_started++; - should_put = 0; - list_add_tail(&cache->io_list, io); - } else { - /* - * if we failed to write the cache, the - * generation will be bad and life goes on - */ - ret = 0; - } - } - if (!ret) { - ret = write_one_cache_group(trans, fs_info, - path, cache); - /* - * One of the free space endio workers might have - * created a new block group while updating a free space - * cache's inode (at inode.c:btrfs_finish_ordered_io()) - * and hasn't released its transaction handle yet, in - * which case the new block group is still attached to - * its transaction handle and its creation has not - * finished yet (no block group item in the extent tree - * yet, etc). If this is the case, wait for all free - * space endio workers to finish and retry. This is a - * a very rare case so no need for a more efficient and - * complex approach. - */ - if (ret == -ENOENT) { - wait_event(cur_trans->writer_wait, - atomic_read(&cur_trans->num_writers) == 1); - ret = write_one_cache_group(trans, fs_info, - path, cache); - } - if (ret) - btrfs_abort_transaction(trans, ret); - } - - /* if its not on the io list, we need to put the block group */ - if (should_put) - btrfs_put_block_group(cache); - btrfs_delayed_refs_rsv_release(fs_info, 1); - spin_lock(&cur_trans->dirty_bgs_lock); - } - spin_unlock(&cur_trans->dirty_bgs_lock); - - /* - * Refer to the definition of io_bgs member for details why it's safe - * to use it without any locking - */ - while (!list_empty(io)) { - cache = list_first_entry(io, struct btrfs_block_group_cache, - io_list); - list_del_init(&cache->io_list); - btrfs_wait_cache_io(trans, cache, path); - btrfs_put_block_group(cache); - } - - btrfs_free_path(path); - return ret; -} - int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; int readonly = 0; block_group = btrfs_lookup_block_group(fs_info, bytenr); @@ -3886,254 +2548,6 @@ int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) return readonly; } -bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) -{ - struct btrfs_block_group_cache *bg; - bool ret = true; - - bg = btrfs_lookup_block_group(fs_info, bytenr); - if (!bg) - return false; - - spin_lock(&bg->lock); - if (bg->ro) - ret = false; - else - atomic_inc(&bg->nocow_writers); - spin_unlock(&bg->lock); - - /* no put on block group, done by btrfs_dec_nocow_writers */ - if (!ret) - btrfs_put_block_group(bg); - - return ret; - -} - -void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) -{ - struct btrfs_block_group_cache *bg; - - bg = btrfs_lookup_block_group(fs_info, bytenr); - ASSERT(bg); - if (atomic_dec_and_test(&bg->nocow_writers)) - wake_up_var(&bg->nocow_writers); - /* - * Once for our lookup and once for the lookup done by a previous call - * to btrfs_inc_nocow_writers() - */ - btrfs_put_block_group(bg); - btrfs_put_block_group(bg); -} - -void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) -{ - wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); -} - -static const char *alloc_name(u64 flags) -{ - switch (flags) { - case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: - return "mixed"; - case BTRFS_BLOCK_GROUP_METADATA: - return "metadata"; - case BTRFS_BLOCK_GROUP_DATA: - return "data"; - case BTRFS_BLOCK_GROUP_SYSTEM: - return "system"; - default: - WARN_ON(1); - return "invalid-combination"; - }; -} - -static int create_space_info(struct btrfs_fs_info *info, u64 flags) -{ - - struct btrfs_space_info *space_info; - int i; - int ret; - - space_info = kzalloc(sizeof(*space_info), GFP_NOFS); - if (!space_info) - return -ENOMEM; - - ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, - GFP_KERNEL); - if (ret) { - kfree(space_info); - return ret; - } - - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) - INIT_LIST_HEAD(&space_info->block_groups[i]); - init_rwsem(&space_info->groups_sem); - spin_lock_init(&space_info->lock); - space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; - space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; - init_waitqueue_head(&space_info->wait); - INIT_LIST_HEAD(&space_info->ro_bgs); - INIT_LIST_HEAD(&space_info->tickets); - INIT_LIST_HEAD(&space_info->priority_tickets); - - ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, - info->space_info_kobj, "%s", - alloc_name(space_info->flags)); - if (ret) { - percpu_counter_destroy(&space_info->total_bytes_pinned); - kfree(space_info); - return ret; - } - - list_add_rcu(&space_info->list, &info->space_info); - if (flags & BTRFS_BLOCK_GROUP_DATA) - info->data_sinfo = space_info; - - return ret; -} - -static void update_space_info(struct btrfs_fs_info *info, u64 flags, - u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, - struct btrfs_space_info **space_info) -{ - struct btrfs_space_info *found; - int factor; - - factor = btrfs_bg_type_to_factor(flags); - - found = __find_space_info(info, flags); - ASSERT(found); - spin_lock(&found->lock); - found->total_bytes += total_bytes; - found->disk_total += total_bytes * factor; - found->bytes_used += bytes_used; - found->disk_used += bytes_used * factor; - found->bytes_readonly += bytes_readonly; - if (total_bytes > 0) - found->full = 0; - space_info_add_new_bytes(info, found, total_bytes - - bytes_used - bytes_readonly); - spin_unlock(&found->lock); - *space_info = found; -} - -static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) -{ - u64 extra_flags = chunk_to_extended(flags) & - BTRFS_EXTENDED_PROFILE_MASK; - - write_seqlock(&fs_info->profiles_lock); - if (flags & BTRFS_BLOCK_GROUP_DATA) - fs_info->avail_data_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - fs_info->avail_metadata_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - fs_info->avail_system_alloc_bits |= extra_flags; - write_sequnlock(&fs_info->profiles_lock); -} - -/* - * returns target flags in extended format or 0 if restripe for this - * chunk_type is not in progress - * - * should be called with balance_lock held - */ -static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) -{ - struct btrfs_balance_control *bctl = fs_info->balance_ctl; - u64 target = 0; - - if (!bctl) - return 0; - - if (flags & BTRFS_BLOCK_GROUP_DATA && - bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { - target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; - } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && - bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { - target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; - } else if (flags & BTRFS_BLOCK_GROUP_METADATA && - bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { - target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; - } - - return target; -} - -/* - * @flags: available profiles in extended format (see ctree.h) - * - * Returns reduced profile in chunk format. If profile changing is in - * progress (either running or paused) picks the target profile (if it's - * already available), otherwise falls back to plain reducing. - */ -static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) -{ - u64 num_devices = fs_info->fs_devices->rw_devices; - u64 target; - u64 raid_type; - u64 allowed = 0; - - /* - * see if restripe for this chunk_type is in progress, if so - * try to reduce to the target profile - */ - spin_lock(&fs_info->balance_lock); - target = get_restripe_target(fs_info, flags); - if (target) { - /* pick target profile only if it's already available */ - if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { - spin_unlock(&fs_info->balance_lock); - return extended_to_chunk(target); - } - } - spin_unlock(&fs_info->balance_lock); - - /* First, mask out the RAID levels which aren't possible */ - for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { - if (num_devices >= btrfs_raid_array[raid_type].devs_min) - allowed |= btrfs_raid_array[raid_type].bg_flag; - } - allowed &= flags; - - if (allowed & BTRFS_BLOCK_GROUP_RAID6) - allowed = BTRFS_BLOCK_GROUP_RAID6; - else if (allowed & BTRFS_BLOCK_GROUP_RAID5) - allowed = BTRFS_BLOCK_GROUP_RAID5; - else if (allowed & BTRFS_BLOCK_GROUP_RAID10) - allowed = BTRFS_BLOCK_GROUP_RAID10; - else if (allowed & BTRFS_BLOCK_GROUP_RAID1) - allowed = BTRFS_BLOCK_GROUP_RAID1; - else if (allowed & BTRFS_BLOCK_GROUP_RAID0) - allowed = BTRFS_BLOCK_GROUP_RAID0; - - flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; - - return extended_to_chunk(flags | allowed); -} - -static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) -{ - unsigned seq; - u64 flags; - - do { - flags = orig_flags; - seq = read_seqbegin(&fs_info->profiles_lock); - - if (flags & BTRFS_BLOCK_GROUP_DATA) - flags |= fs_info->avail_data_alloc_bits; - else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - flags |= fs_info->avail_system_alloc_bits; - else if (flags & BTRFS_BLOCK_GROUP_METADATA) - flags |= fs_info->avail_metadata_alloc_bits; - } while (read_seqretry(&fs_info->profiles_lock, seq)); - - return btrfs_reduce_alloc_profile(fs_info, flags); -} - static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4147,2331 +2561,13 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) else flags = BTRFS_BLOCK_GROUP_METADATA; - ret = get_alloc_profile(fs_info, flags); - return ret; -} - -u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) -{ - return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA); -} - -u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info) -{ - return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA); -} - -u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) -{ - return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); -} - -static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, - bool may_use_included) -{ - ASSERT(s_info); - return s_info->bytes_used + s_info->bytes_reserved + - s_info->bytes_pinned + s_info->bytes_readonly + - (may_use_included ? s_info->bytes_may_use : 0); -} - -int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) -{ - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; - u64 used; - int ret = 0; - int need_commit = 2; - int have_pinned_space; - - /* make sure bytes are sectorsize aligned */ - bytes = ALIGN(bytes, fs_info->sectorsize); - - if (btrfs_is_free_space_inode(inode)) { - need_commit = 0; - ASSERT(current->journal_info); - } - -again: - /* make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - used = btrfs_space_info_used(data_sinfo, true); - - if (used + bytes > data_sinfo->total_bytes) { - struct btrfs_trans_handle *trans; - - /* - * if we don't have enough free bytes in this space then we need - * to alloc a new chunk. - */ - if (!data_sinfo->full) { - u64 alloc_target; - - data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; - spin_unlock(&data_sinfo->lock); - - alloc_target = btrfs_data_alloc_profile(fs_info); - /* - * It is ugly that we don't call nolock join - * transaction for the free space inode case here. - * But it is safe because we only do the data space - * reservation for the free space cache in the - * transaction context, the common join transaction - * just increase the counter of the current transaction - * handler, doesn't try to acquire the trans_lock of - * the fs. - */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = do_chunk_alloc(trans, alloc_target, - CHUNK_ALLOC_NO_FORCE); - btrfs_end_transaction(trans); - if (ret < 0) { - if (ret != -ENOSPC) - return ret; - else { - have_pinned_space = 1; - goto commit_trans; - } - } - - goto again; - } - - /* - * If we don't have enough pinned space to deal with this - * allocation, and no removed chunk in current transaction, - * don't bother committing the transaction. - */ - have_pinned_space = __percpu_counter_compare( - &data_sinfo->total_bytes_pinned, - used + bytes - data_sinfo->total_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - spin_unlock(&data_sinfo->lock); - - /* commit the current transaction and try again */ -commit_trans: - if (need_commit) { - need_commit--; - - if (need_commit > 0) { - btrfs_start_delalloc_roots(fs_info, -1); - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, - (u64)-1); - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - if (have_pinned_space >= 0 || - test_bit(BTRFS_TRANS_HAVE_FREE_BGS, - &trans->transaction->flags) || - need_commit > 0) { - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - /* - * The cleaner kthread might still be doing iput - * operations. Wait for it to finish so that - * more space is released. We don't need to - * explicitly run the delayed iputs here because - * the commit_transaction would have woken up - * the cleaner. - */ - ret = btrfs_wait_on_delayed_iputs(fs_info); - if (ret) - return ret; - goto again; - } else { - btrfs_end_transaction(trans); - } - } - - trace_btrfs_space_reservation(fs_info, - "space_info:enospc", - data_sinfo->flags, bytes, 1); - return -ENOSPC; - } - update_bytes_may_use(data_sinfo, bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - data_sinfo->flags, bytes, 1); - spin_unlock(&data_sinfo->lock); - - return 0; -} - -int btrfs_check_data_free_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret; - - /* align the range */ - len = round_up(start + len, fs_info->sectorsize) - - round_down(start, fs_info->sectorsize); - start = round_down(start, fs_info->sectorsize); - - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); - if (ret < 0) - return ret; - - /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ - ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); - if (ret < 0) - btrfs_free_reserved_data_space_noquota(inode, start, len); - else - ret = 0; - return ret; -} - -/* - * Called if we need to clear a data reservation for this inode - * Normally in a error case. - * - * This one will *NOT* use accurate qgroup reserved space API, just for case - * which we can't sleep and is sure it won't affect qgroup reserved space. - * Like clear_bit_hook(). - */ -void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, - u64 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_space_info *data_sinfo; - - /* Make sure the range is aligned to sectorsize */ - len = round_up(start + len, fs_info->sectorsize) - - round_down(start, fs_info->sectorsize); - start = round_down(start, fs_info->sectorsize); - - data_sinfo = fs_info->data_sinfo; - spin_lock(&data_sinfo->lock); - update_bytes_may_use(data_sinfo, -len); - trace_btrfs_space_reservation(fs_info, "space_info", - data_sinfo->flags, len, 0); - spin_unlock(&data_sinfo->lock); -} - -/* - * Called if we need to clear a data reservation for this inode - * Normally in a error case. - * - * This one will handle the per-inode data rsv map for accurate reserved - * space framework. - */ -void btrfs_free_reserved_data_space(struct inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* Make sure the range is aligned to sectorsize */ - len = round_up(start + len, root->fs_info->sectorsize) - - round_down(start, root->fs_info->sectorsize); - start = round_down(start, root->fs_info->sectorsize); - - btrfs_free_reserved_data_space_noquota(inode, start, len); - btrfs_qgroup_free_data(inode, reserved, start, len); -} - -static void force_metadata_allocation(struct btrfs_fs_info *info) -{ - struct list_head *head = &info->space_info; - struct btrfs_space_info *found; - - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & BTRFS_BLOCK_GROUP_METADATA) - found->force_alloc = CHUNK_ALLOC_FORCE; - } - rcu_read_unlock(); -} - -static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) -{ - return (global->size << 1); -} - -static int should_alloc_chunk(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *sinfo, int force) -{ - u64 bytes_used = btrfs_space_info_used(sinfo, false); - u64 thresh; - - if (force == CHUNK_ALLOC_FORCE) - return 1; - - /* - * in limited mode, we want to have some free space up to - * about 1% of the FS size. - */ - if (force == CHUNK_ALLOC_LIMITED) { - thresh = btrfs_super_total_bytes(fs_info->super_copy); - thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); - - if (sinfo->total_bytes - bytes_used < thresh) - return 1; - } - - if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) - return 0; - return 1; -} - -static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) -{ - u64 num_dev; - - if (type & (BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) - num_dev = fs_info->fs_devices->rw_devices; - else if (type & BTRFS_BLOCK_GROUP_RAID1) - num_dev = 2; - else - num_dev = 1; /* DUP or single */ - - return num_dev; -} - -/* - * If @is_allocation is true, reserve space in the system space info necessary - * for allocating a chunk, otherwise if it's false, reserve space necessary for - * removing a chunk. - */ -void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_space_info *info; - u64 left; - u64 thresh; - int ret = 0; - u64 num_devs; - - /* - * Needed because we can end up allocating a system chunk and for an - * atomic and race free space reservation in the chunk block reserve. - */ - lockdep_assert_held(&fs_info->chunk_mutex); - - info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); - spin_lock(&info->lock); - left = info->total_bytes - btrfs_space_info_used(info, true); - spin_unlock(&info->lock); - - num_devs = get_profile_num_devs(fs_info, type); - - /* num_devs device items to update and 1 chunk item to add or remove */ - thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) + - btrfs_calc_trans_metadata_size(fs_info, 1); - - if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { - btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", - left, thresh, type); - dump_space_info(fs_info, info, 0, 0); - } - - if (left < thresh) { - u64 flags = btrfs_system_alloc_profile(fs_info); - - /* - * Ignore failure to create system chunk. We might end up not - * needing it, as we might not need to COW all nodes/leafs from - * the paths we visit in the chunk tree (they were already COWed - * or created in the current transaction for example). - */ - ret = btrfs_alloc_chunk(trans, flags); - } - - if (!ret) { - ret = btrfs_block_rsv_add(fs_info->chunk_root, - &fs_info->chunk_block_rsv, - thresh, BTRFS_RESERVE_NO_FLUSH); - if (!ret) - trans->chunk_bytes_reserved += thresh; - } -} - -/* - * If force is CHUNK_ALLOC_FORCE: - * - return 1 if it successfully allocates a chunk, - * - return errors including -ENOSPC otherwise. - * If force is NOT CHUNK_ALLOC_FORCE: - * - return 0 if it doesn't need to allocate a new chunk, - * - return 1 if it successfully allocates a chunk, - * - return errors including -ENOSPC otherwise. - */ -static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, - int force) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_space_info *space_info; - bool wait_for_alloc = false; - bool should_alloc = false; - int ret = 0; - - /* Don't re-enter if we're already allocating a chunk */ - if (trans->allocating_chunk) - return -ENOSPC; - - space_info = __find_space_info(fs_info, flags); - ASSERT(space_info); - - do { - spin_lock(&space_info->lock); - if (force < space_info->force_alloc) - force = space_info->force_alloc; - should_alloc = should_alloc_chunk(fs_info, space_info, force); - if (space_info->full) { - /* No more free physical space */ - if (should_alloc) - ret = -ENOSPC; - else - ret = 0; - spin_unlock(&space_info->lock); - return ret; - } else if (!should_alloc) { - spin_unlock(&space_info->lock); - return 0; - } else if (space_info->chunk_alloc) { - /* - * Someone is already allocating, so we need to block - * until this someone is finished and then loop to - * recheck if we should continue with our allocation - * attempt. - */ - wait_for_alloc = true; - spin_unlock(&space_info->lock); - mutex_lock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->chunk_mutex); - } else { - /* Proceed with allocation */ - space_info->chunk_alloc = 1; - wait_for_alloc = false; - spin_unlock(&space_info->lock); - } - - cond_resched(); - } while (wait_for_alloc); - - mutex_lock(&fs_info->chunk_mutex); - trans->allocating_chunk = true; - - /* - * If we have mixed data/metadata chunks we want to make sure we keep - * allocating mixed chunks instead of individual chunks. - */ - if (btrfs_mixed_space_info(space_info)) - flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); - - /* - * if we're doing a data chunk, go ahead and make sure that - * we keep a reasonable number of metadata chunks allocated in the - * FS as well. - */ - if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { - fs_info->data_chunk_allocations++; - if (!(fs_info->data_chunk_allocations % - fs_info->metadata_ratio)) - force_metadata_allocation(fs_info); - } - - /* - * Check if we have enough space in SYSTEM chunk because we may need - * to update devices. - */ - check_system_chunk(trans, flags); - - ret = btrfs_alloc_chunk(trans, flags); - trans->allocating_chunk = false; - - spin_lock(&space_info->lock); - if (ret < 0) { - if (ret == -ENOSPC) - space_info->full = 1; - else - goto out; - } else { - ret = 1; - space_info->max_extent_size = 0; - } - - space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; -out: - space_info->chunk_alloc = 0; - spin_unlock(&space_info->lock); - mutex_unlock(&fs_info->chunk_mutex); - /* - * When we allocate a new chunk we reserve space in the chunk block - * reserve to make sure we can COW nodes/leafs in the chunk tree or - * add new nodes/leafs to it if we end up needing to do it when - * inserting the chunk item and updating device items as part of the - * second phase of chunk allocation, performed by - * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a - * large number of new block groups to create in our transaction - * handle's new_bgs list to avoid exhausting the chunk block reserve - * in extreme cases - like having a single transaction create many new - * block groups when starting to write out the free space caches of all - * the block groups that were made dirty during the lifetime of the - * transaction. - */ - if (trans->chunk_bytes_reserved >= (u64)SZ_2M) - btrfs_create_pending_block_groups(trans); - - return ret; -} - -static int can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk) -{ - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 profile; - u64 space_size; - u64 avail; - u64 used; - int factor; - - /* Don't overcommit when in mixed mode. */ - if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) - return 0; - - if (system_chunk) - profile = btrfs_system_alloc_profile(fs_info); - else - profile = btrfs_metadata_alloc_profile(fs_info); - - used = btrfs_space_info_used(space_info, false); - - /* - * We only want to allow over committing if we have lots of actual space - * free, but if we don't have enough space to handle the global reserve - * space then we could end up having a real enospc problem when trying - * to allocate a chunk or some other such important allocation. - */ - spin_lock(&global_rsv->lock); - space_size = calc_global_rsv_need_space(global_rsv); - spin_unlock(&global_rsv->lock); - if (used + space_size >= space_info->total_bytes) - return 0; - - used += space_info->bytes_may_use; - - avail = atomic64_read(&fs_info->free_chunk_space); - - /* - * If we have dup, raid1 or raid10 then only half of the free - * space is actually usable. For raid56, the space info used - * doesn't include the parity drive, so we don't have to - * change the math - */ - factor = btrfs_bg_type_to_factor(profile); - avail = div_u64(avail, factor); - - /* - * If we aren't flushing all things, let us overcommit up to - * 1/2th of the space. If we can flush, don't let us overcommit - * too much, let it overcommit up to 1/8 of the space. - */ - if (flush == BTRFS_RESERVE_FLUSH_ALL) - avail >>= 3; - else - avail >>= 1; - - if (used + bytes < space_info->total_bytes + avail) - return 1; - return 0; -} - -static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, - unsigned long nr_pages, int nr_items) -{ - struct super_block *sb = fs_info->sb; - - if (down_read_trylock(&sb->s_umount)) { - writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); - up_read(&sb->s_umount); - } else { - /* - * We needn't worry the filesystem going from r/w to r/o though - * we don't acquire ->s_umount mutex, because the filesystem - * should guarantee the delalloc inodes list be empty after - * the filesystem is readonly(all dirty pages are written to - * the disk). - */ - btrfs_start_delalloc_roots(fs_info, nr_items); - if (!current->journal_info) - btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); - } -} - -static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, - u64 to_reclaim) -{ - u64 bytes; - u64 nr; - - bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - nr = div64_u64(to_reclaim, bytes); - if (!nr) - nr = 1; - return nr; -} - -#define EXTENT_SIZE_PER_ITEM SZ_256K - -/* - * shrink metadata reservation for delalloc - */ -static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, - u64 orig, bool wait_ordered) -{ - struct btrfs_space_info *space_info; - struct btrfs_trans_handle *trans; - u64 delalloc_bytes; - u64 async_pages; - u64 items; - long time_left; - unsigned long nr_pages; - int loops; - - /* Calc the number of the pages we need flush for space reservation */ - items = calc_reclaim_items_nr(fs_info, to_reclaim); - to_reclaim = items * EXTENT_SIZE_PER_ITEM; - - trans = (struct btrfs_trans_handle *)current->journal_info; - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - - delalloc_bytes = percpu_counter_sum_positive( - &fs_info->delalloc_bytes); - if (delalloc_bytes == 0) { - if (trans) - return; - if (wait_ordered) - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); - return; - } - - loops = 0; - while (delalloc_bytes && loops < 3) { - nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; - - /* - * Triggers inode writeback for up to nr_pages. This will invoke - * ->writepages callback and trigger delalloc filling - * (btrfs_run_delalloc_range()). - */ - btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); - - /* - * We need to wait for the compressed pages to start before - * we continue. - */ - async_pages = atomic_read(&fs_info->async_delalloc_pages); - if (!async_pages) - goto skip_async; - - /* - * Calculate how many compressed pages we want to be written - * before we continue. I.e if there are more async pages than we - * require wait_event will wait until nr_pages are written. - */ - if (async_pages <= nr_pages) - async_pages = 0; - else - async_pages -= nr_pages; - - wait_event(fs_info->async_submit_wait, - atomic_read(&fs_info->async_delalloc_pages) <= - (int)async_pages); -skip_async: - spin_lock(&space_info->lock); - if (list_empty(&space_info->tickets) && - list_empty(&space_info->priority_tickets)) { - spin_unlock(&space_info->lock); - break; - } - spin_unlock(&space_info->lock); - - loops++; - if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); - } else { - time_left = schedule_timeout_killable(1); - if (time_left) - break; - } - delalloc_bytes = percpu_counter_sum_positive( - &fs_info->delalloc_bytes); - } -} - -struct reserve_ticket { - u64 orig_bytes; - u64 bytes; - int error; - struct list_head list; - wait_queue_head_t wait; -}; - -/** - * maybe_commit_transaction - possibly commit the transaction if its ok to - * @root - the root we're allocating for - * @bytes - the number of bytes we want to reserve - * @force - force the commit - * - * This will check to make sure that committing the transaction will actually - * get us somewhere and then commit the transaction if it does. Otherwise it - * will return -ENOSPC. - */ -static int may_commit_transaction(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) -{ - struct reserve_ticket *ticket = NULL; - struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_trans_handle *trans; - u64 bytes_needed; - u64 reclaim_bytes = 0; - - trans = (struct btrfs_trans_handle *)current->journal_info; - if (trans) - return -EAGAIN; - - spin_lock(&space_info->lock); - if (!list_empty(&space_info->priority_tickets)) - ticket = list_first_entry(&space_info->priority_tickets, - struct reserve_ticket, list); - else if (!list_empty(&space_info->tickets)) - ticket = list_first_entry(&space_info->tickets, - struct reserve_ticket, list); - bytes_needed = (ticket) ? ticket->bytes : 0; - spin_unlock(&space_info->lock); - - if (!bytes_needed) - return 0; - - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - /* - * See if there is enough pinned space to make this reservation, or if - * we have block groups that are going to be freed, allowing us to - * possibly do a chunk allocation the next loop through. - */ - if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || - __percpu_counter_compare(&space_info->total_bytes_pinned, - bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) - goto commit; - - /* - * See if there is some space in the delayed insertion reservation for - * this reservation. - */ - if (space_info != delayed_rsv->space_info) - goto enospc; - - spin_lock(&delayed_rsv->lock); - reclaim_bytes += delayed_rsv->reserved; - spin_unlock(&delayed_rsv->lock); - - spin_lock(&delayed_refs_rsv->lock); - reclaim_bytes += delayed_refs_rsv->reserved; - spin_unlock(&delayed_refs_rsv->lock); - if (reclaim_bytes >= bytes_needed) - goto commit; - bytes_needed -= reclaim_bytes; - - if (__percpu_counter_compare(&space_info->total_bytes_pinned, - bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) - goto enospc; - -commit: - return btrfs_commit_transaction(trans); -enospc: - btrfs_end_transaction(trans); - return -ENOSPC; -} - -/* - * Try to flush some data based on policy set by @state. This is only advisory - * and may fail for various reasons. The caller is supposed to examine the - * state of @space_info to detect the outcome. - */ -static void flush_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 num_bytes, - int state) -{ - struct btrfs_root *root = fs_info->extent_root; - struct btrfs_trans_handle *trans; - int nr; - int ret = 0; - - switch (state) { - case FLUSH_DELAYED_ITEMS_NR: - case FLUSH_DELAYED_ITEMS: - if (state == FLUSH_DELAYED_ITEMS_NR) - nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; - else - nr = -1; - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - ret = btrfs_run_delayed_items_nr(trans, nr); - btrfs_end_transaction(trans); - break; - case FLUSH_DELALLOC: - case FLUSH_DELALLOC_WAIT: - shrink_delalloc(fs_info, num_bytes * 2, num_bytes, - state == FLUSH_DELALLOC_WAIT); - break; - case FLUSH_DELAYED_REFS_NR: - case FLUSH_DELAYED_REFS: - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - if (state == FLUSH_DELAYED_REFS_NR) - nr = calc_reclaim_items_nr(fs_info, num_bytes); - else - nr = 0; - btrfs_run_delayed_refs(trans, nr); - btrfs_end_transaction(trans); - break; - case ALLOC_CHUNK: - case ALLOC_CHUNK_FORCE: - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - ret = do_chunk_alloc(trans, - btrfs_metadata_alloc_profile(fs_info), - (state == ALLOC_CHUNK) ? - CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); - btrfs_end_transaction(trans); - if (ret > 0 || ret == -ENOSPC) - ret = 0; - break; - case COMMIT_TRANS: - /* - * If we have pending delayed iputs then we could free up a - * bunch of pinned space, so make sure we run the iputs before - * we do our pinned bytes check below. - */ - btrfs_run_delayed_iputs(fs_info); - btrfs_wait_on_delayed_iputs(fs_info); - - ret = may_commit_transaction(fs_info, space_info); - break; - default: - ret = -ENOSPC; - break; - } - - trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, - ret); - return; -} - -static inline u64 -btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - bool system_chunk) -{ - struct reserve_ticket *ticket; - u64 used; - u64 expected; - u64 to_reclaim = 0; - - list_for_each_entry(ticket, &space_info->tickets, list) - to_reclaim += ticket->bytes; - list_for_each_entry(ticket, &space_info->priority_tickets, list) - to_reclaim += ticket->bytes; - if (to_reclaim) - return to_reclaim; - - to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (can_overcommit(fs_info, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) - return 0; - - used = btrfs_space_info_used(space_info, true); - - if (can_overcommit(fs_info, space_info, SZ_1M, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) - expected = div_factor_fine(space_info->total_bytes, 95); - else - expected = div_factor_fine(space_info->total_bytes, 90); - - if (used > expected) - to_reclaim = used - expected; - else - to_reclaim = 0; - to_reclaim = min(to_reclaim, space_info->bytes_may_use + - space_info->bytes_reserved); - return to_reclaim; -} - -static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 used, bool system_chunk) -{ - u64 thresh = div_factor_fine(space_info->total_bytes, 98); - - /* If we're just plain full then async reclaim just slows us down. */ - if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) - return 0; - - if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, - system_chunk)) - return 0; - - return (used >= thresh && !btrfs_fs_closing(fs_info) && - !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); -} - -static bool wake_all_tickets(struct list_head *head) -{ - struct reserve_ticket *ticket; - - while (!list_empty(head)) { - ticket = list_first_entry(head, struct reserve_ticket, list); - list_del_init(&ticket->list); - ticket->error = -ENOSPC; - wake_up(&ticket->wait); - if (ticket->bytes != ticket->orig_bytes) - return true; - } - return false; -} - -/* - * This is for normal flushers, we can wait all goddamned day if we want to. We - * will loop and continuously try to flush as long as we are making progress. - * We count progress as clearing off tickets each time we have to loop. - */ -static void btrfs_async_reclaim_metadata_space(struct work_struct *work) -{ - struct btrfs_fs_info *fs_info; - struct btrfs_space_info *space_info; - u64 to_reclaim; - int flush_state; - int commit_cycles = 0; - u64 last_tickets_id; - - fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - - spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, - false); - if (!to_reclaim) { - space_info->flush = 0; - spin_unlock(&space_info->lock); - return; - } - last_tickets_id = space_info->tickets_id; - spin_unlock(&space_info->lock); - - flush_state = FLUSH_DELAYED_ITEMS_NR; - do { - flush_space(fs_info, space_info, to_reclaim, flush_state); - spin_lock(&space_info->lock); - if (list_empty(&space_info->tickets)) { - space_info->flush = 0; - spin_unlock(&space_info->lock); - return; - } - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, - space_info, - false); - if (last_tickets_id == space_info->tickets_id) { - flush_state++; - } else { - last_tickets_id = space_info->tickets_id; - flush_state = FLUSH_DELAYED_ITEMS_NR; - if (commit_cycles) - commit_cycles--; - } - - /* - * We don't want to force a chunk allocation until we've tried - * pretty hard to reclaim space. Think of the case where we - * freed up a bunch of space and so have a lot of pinned space - * to reclaim. We would rather use that than possibly create a - * underutilized metadata chunk. So if this is our first run - * through the flushing state machine skip ALLOC_CHUNK_FORCE and - * commit the transaction. If nothing has changed the next go - * around then we can force a chunk allocation. - */ - if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) - flush_state++; - - if (flush_state > COMMIT_TRANS) { - commit_cycles++; - if (commit_cycles > 2) { - if (wake_all_tickets(&space_info->tickets)) { - flush_state = FLUSH_DELAYED_ITEMS_NR; - commit_cycles--; - } else { - space_info->flush = 0; - } - } else { - flush_state = FLUSH_DELAYED_ITEMS_NR; - } - } - spin_unlock(&space_info->lock); - } while (flush_state <= COMMIT_TRANS); -} - -void btrfs_init_async_reclaim_work(struct work_struct *work) -{ - INIT_WORK(work, btrfs_async_reclaim_metadata_space); -} - -static const enum btrfs_flush_state priority_flush_states[] = { - FLUSH_DELAYED_ITEMS_NR, - FLUSH_DELAYED_ITEMS, - ALLOC_CHUNK, -}; - -static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - struct reserve_ticket *ticket) -{ - u64 to_reclaim; - int flush_state; - - spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, - false); - if (!to_reclaim) { - spin_unlock(&space_info->lock); - return; - } - spin_unlock(&space_info->lock); - - flush_state = 0; - do { - flush_space(fs_info, space_info, to_reclaim, - priority_flush_states[flush_state]); - flush_state++; - spin_lock(&space_info->lock); - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); - return; - } - spin_unlock(&space_info->lock); - } while (flush_state < ARRAY_SIZE(priority_flush_states)); -} - -static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - struct reserve_ticket *ticket) - -{ - DEFINE_WAIT(wait); - u64 reclaim_bytes = 0; - int ret = 0; - - spin_lock(&space_info->lock); - while (ticket->bytes > 0 && ticket->error == 0) { - ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); - if (ret) { - ret = -EINTR; - break; - } - spin_unlock(&space_info->lock); - - schedule(); - - finish_wait(&ticket->wait, &wait); - spin_lock(&space_info->lock); - } - if (!ret) - ret = ticket->error; - if (!list_empty(&ticket->list)) - list_del_init(&ticket->list); - if (ticket->bytes && ticket->bytes < ticket->orig_bytes) - reclaim_bytes = ticket->orig_bytes - ticket->bytes; - spin_unlock(&space_info->lock); - - if (reclaim_bytes) - space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); - return ret; -} - -/** - * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space - * @root - the root we're allocating for - * @space_info - the space info we want to allocate from - * @orig_bytes - the number of bytes we want - * @flush - whether or not we can flush to make our reservation - * - * This will reserve orig_bytes number of bytes from the space info associated - * with the block_rsv. If there is not enough space it will make an attempt to - * flush out space to make room. It will do this by flushing delalloc if - * possible or committing the transaction. If flush is 0 then no attempts to - * regain reservations will be made and this will fail if there is not enough - * space already. - */ -static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk) -{ - struct reserve_ticket ticket; - u64 used; - u64 reclaim_bytes = 0; - int ret = 0; - - ASSERT(orig_bytes); - ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); - - spin_lock(&space_info->lock); - ret = -ENOSPC; - used = btrfs_space_info_used(space_info, true); - - /* - * If we have enough space then hooray, make our reservation and carry - * on. If not see if we can overcommit, and if we can, hooray carry on. - * If not things get more complicated. - */ - if (used + orig_bytes <= space_info->total_bytes) { - update_bytes_may_use(space_info, orig_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, orig_bytes, 1); - ret = 0; - } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, - system_chunk)) { - update_bytes_may_use(space_info, orig_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, orig_bytes, 1); - ret = 0; - } - - /* - * If we couldn't make a reservation then setup our reservation ticket - * and kick the async worker if it's not already running. - * - * If we are a priority flusher then we just need to add our ticket to - * the list and we will do our own flushing further down. - */ - if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { - ticket.orig_bytes = orig_bytes; - ticket.bytes = orig_bytes; - ticket.error = 0; - init_waitqueue_head(&ticket.wait); - if (flush == BTRFS_RESERVE_FLUSH_ALL) { - list_add_tail(&ticket.list, &space_info->tickets); - if (!space_info->flush) { - space_info->flush = 1; - trace_btrfs_trigger_flush(fs_info, - space_info->flags, - orig_bytes, flush, - "enospc"); - queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); - } - } else { - list_add_tail(&ticket.list, - &space_info->priority_tickets); - } - } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { - used += orig_bytes; - /* - * We will do the space reservation dance during log replay, - * which means we won't have fs_info->fs_root set, so don't do - * the async reclaim as we will panic. - */ - if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_do_async_reclaim(fs_info, space_info, - used, system_chunk) && - !work_busy(&fs_info->async_reclaim_work)) { - trace_btrfs_trigger_flush(fs_info, space_info->flags, - orig_bytes, flush, "preempt"); - queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); - } - } - spin_unlock(&space_info->lock); - if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) - return ret; - - if (flush == BTRFS_RESERVE_FLUSH_ALL) - return wait_reserve_ticket(fs_info, space_info, &ticket); - - ret = 0; - priority_reclaim_metadata_space(fs_info, space_info, &ticket); - spin_lock(&space_info->lock); - if (ticket.bytes) { - if (ticket.bytes < orig_bytes) - reclaim_bytes = orig_bytes - ticket.bytes; - list_del_init(&ticket.list); - ret = -ENOSPC; - } - spin_unlock(&space_info->lock); - - if (reclaim_bytes) - space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); - ASSERT(list_empty(&ticket.list)); - return ret; -} - -/** - * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space - * @root - the root we're allocating for - * @block_rsv - the block_rsv we're allocating for - * @orig_bytes - the number of bytes we want - * @flush - whether or not we can flush to make our reservation - * - * This will reserve orig_bytes number of bytes from the space info associated - * with the block_rsv. If there is not enough space it will make an attempt to - * flush out space to make room. It will do this by flushing delalloc if - * possible or committing the transaction. If flush is 0 then no attempts to - * regain reservations will be made and this will fail if there is not enough - * space already. - */ -static int reserve_metadata_bytes(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - int ret; - bool system_chunk = (root == fs_info->chunk_root); - - ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, - orig_bytes, flush, system_chunk); - if (ret == -ENOSPC && - unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { - if (block_rsv != global_rsv && - !block_rsv_use_bytes(global_rsv, orig_bytes)) - ret = 0; - } - if (ret == -ENOSPC) { - trace_btrfs_space_reservation(fs_info, "space_info:enospc", - block_rsv->space_info->flags, - orig_bytes, 1); - - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - dump_space_info(fs_info, block_rsv->space_info, - orig_bytes, 0); - } - return ret; -} - -static struct btrfs_block_rsv *get_block_rsv( - const struct btrfs_trans_handle *trans, - const struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *block_rsv = NULL; - - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || - (root == fs_info->csum_root && trans->adding_csums) || - (root == fs_info->uuid_root)) - block_rsv = trans->block_rsv; - - if (!block_rsv) - block_rsv = root->block_rsv; - - if (!block_rsv) - block_rsv = &fs_info->empty_block_rsv; - - return block_rsv; -} - -static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - int ret = -ENOSPC; - spin_lock(&block_rsv->lock); - if (block_rsv->reserved >= num_bytes) { - block_rsv->reserved -= num_bytes; - if (block_rsv->reserved < block_rsv->size) - block_rsv->full = 0; - ret = 0; - } - spin_unlock(&block_rsv->lock); - return ret; -} - -static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes, bool update_size) -{ - spin_lock(&block_rsv->lock); - block_rsv->reserved += num_bytes; - if (update_size) - block_rsv->size += num_bytes; - else if (block_rsv->reserved >= block_rsv->size) - block_rsv->full = 1; - spin_unlock(&block_rsv->lock); -} - -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor) -{ - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 min_bytes; - - if (global_rsv->space_info != dest->space_info) - return -ENOSPC; - - spin_lock(&global_rsv->lock); - min_bytes = div_factor(global_rsv->size, min_factor); - if (global_rsv->reserved < min_bytes + num_bytes) { - spin_unlock(&global_rsv->lock); - return -ENOSPC; - } - global_rsv->reserved -= num_bytes; - if (global_rsv->reserved < global_rsv->size) - global_rsv->full = 0; - spin_unlock(&global_rsv->lock); - - block_rsv_add_bytes(dest, num_bytes, true); - return 0; -} - -/** - * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. - * @fs_info - the fs info for our fs. - * @src - the source block rsv to transfer from. - * @num_bytes - the number of bytes to transfer. - * - * This transfers up to the num_bytes amount from the src rsv to the - * delayed_refs_rsv. Any extra bytes are returned to the space info. - */ -void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, - u64 num_bytes) -{ - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; - u64 to_free = 0; - - spin_lock(&src->lock); - src->reserved -= num_bytes; - src->size -= num_bytes; - spin_unlock(&src->lock); - - spin_lock(&delayed_refs_rsv->lock); - if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { - u64 delta = delayed_refs_rsv->size - - delayed_refs_rsv->reserved; - if (num_bytes > delta) { - to_free = num_bytes - delta; - num_bytes = delta; - } - } else { - to_free = num_bytes; - num_bytes = 0; - } - - if (num_bytes) - delayed_refs_rsv->reserved += num_bytes; - if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) - delayed_refs_rsv->full = 1; - spin_unlock(&delayed_refs_rsv->lock); - - if (num_bytes) - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, num_bytes, 1); - if (to_free) - space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info, - to_free); -} - -/** - * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. - * @fs_info - the fs_info for our fs. - * @flush - control how we can flush for this reservation. - * - * This will refill the delayed block_rsv up to 1 items size worth of space and - * will return -ENOSPC if we can't make the reservation. - */ -int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, - enum btrfs_reserve_flush_enum flush) -{ - struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); - u64 num_bytes = 0; - int ret = -ENOSPC; - - spin_lock(&block_rsv->lock); - if (block_rsv->reserved < block_rsv->size) { - num_bytes = block_rsv->size - block_rsv->reserved; - num_bytes = min(num_bytes, limit); - } - spin_unlock(&block_rsv->lock); - - if (!num_bytes) - return 0; - - ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv, - num_bytes, flush); - if (ret) - return ret; - block_rsv_add_bytes(block_rsv, num_bytes, 0); - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, num_bytes, 1); - return 0; -} - -/* - * This is for space we already have accounted in space_info->bytes_may_use, so - * basically when we're returning space from block_rsv's. - */ -static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes) -{ - struct reserve_ticket *ticket; - struct list_head *head; - u64 used; - enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; - bool check_overcommit = false; - - spin_lock(&space_info->lock); - head = &space_info->priority_tickets; - - /* - * If we are over our limit then we need to check and see if we can - * overcommit, and if we can't then we just need to free up our space - * and not satisfy any requests. - */ - used = btrfs_space_info_used(space_info, true); - if (used - num_bytes >= space_info->total_bytes) - check_overcommit = true; -again: - while (!list_empty(head) && num_bytes) { - ticket = list_first_entry(head, struct reserve_ticket, - list); - /* - * We use 0 bytes because this space is already reserved, so - * adding the ticket space would be a double count. - */ - if (check_overcommit && - !can_overcommit(fs_info, space_info, 0, flush, false)) - break; - if (num_bytes >= ticket->bytes) { - list_del_init(&ticket->list); - num_bytes -= ticket->bytes; - ticket->bytes = 0; - space_info->tickets_id++; - wake_up(&ticket->wait); - } else { - ticket->bytes -= num_bytes; - num_bytes = 0; - } - } - - if (num_bytes && head == &space_info->priority_tickets) { - head = &space_info->tickets; - flush = BTRFS_RESERVE_FLUSH_ALL; - goto again; - } - update_bytes_may_use(space_info, -num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, num_bytes, 0); - spin_unlock(&space_info->lock); -} - -/* - * This is for newly allocated space that isn't accounted in - * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent - * we use this helper. - */ -static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes) -{ - struct reserve_ticket *ticket; - struct list_head *head = &space_info->priority_tickets; - -again: - while (!list_empty(head) && num_bytes) { - ticket = list_first_entry(head, struct reserve_ticket, - list); - if (num_bytes >= ticket->bytes) { - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, - ticket->bytes, 1); - list_del_init(&ticket->list); - num_bytes -= ticket->bytes; - update_bytes_may_use(space_info, ticket->bytes); - ticket->bytes = 0; - space_info->tickets_id++; - wake_up(&ticket->wait); - } else { - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, - num_bytes, 1); - update_bytes_may_use(space_info, num_bytes); - ticket->bytes -= num_bytes; - num_bytes = 0; - } - } - - if (num_bytes && head == &space_info->priority_tickets) { - head = &space_info->tickets; - goto again; - } -} - -static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - struct btrfs_block_rsv *dest, u64 num_bytes, - u64 *qgroup_to_release_ret) -{ - struct btrfs_space_info *space_info = block_rsv->space_info; - u64 qgroup_to_release = 0; - u64 ret; - - spin_lock(&block_rsv->lock); - if (num_bytes == (u64)-1) { - num_bytes = block_rsv->size; - qgroup_to_release = block_rsv->qgroup_rsv_size; - } - block_rsv->size -= num_bytes; - if (block_rsv->reserved >= block_rsv->size) { - num_bytes = block_rsv->reserved - block_rsv->size; - block_rsv->reserved = block_rsv->size; - block_rsv->full = 1; - } else { - num_bytes = 0; - } - if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { - qgroup_to_release = block_rsv->qgroup_rsv_reserved - - block_rsv->qgroup_rsv_size; - block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; - } else { - qgroup_to_release = 0; - } - spin_unlock(&block_rsv->lock); - - ret = num_bytes; - if (num_bytes > 0) { - if (dest) { - spin_lock(&dest->lock); - if (!dest->full) { - u64 bytes_to_add; - - bytes_to_add = dest->size - dest->reserved; - bytes_to_add = min(num_bytes, bytes_to_add); - dest->reserved += bytes_to_add; - if (dest->reserved >= dest->size) - dest->full = 1; - num_bytes -= bytes_to_add; - } - spin_unlock(&dest->lock); - } - if (num_bytes) - space_info_add_old_bytes(fs_info, space_info, - num_bytes); - } - if (qgroup_to_release_ret) - *qgroup_to_release_ret = qgroup_to_release; - return ret; -} - -int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, - struct btrfs_block_rsv *dst, u64 num_bytes, - bool update_size) -{ - int ret; - - ret = block_rsv_use_bytes(src, num_bytes); - if (ret) - return ret; - - block_rsv_add_bytes(dst, num_bytes, update_size); - return 0; -} - -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) -{ - memset(rsv, 0, sizeof(*rsv)); - spin_lock_init(&rsv->lock); - rsv->type = type; -} - -void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv, - unsigned short type) -{ - btrfs_init_block_rsv(rsv, type); - rsv->space_info = __find_space_info(fs_info, - BTRFS_BLOCK_GROUP_METADATA); -} - -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, - unsigned short type) -{ - struct btrfs_block_rsv *block_rsv; - - block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); - if (!block_rsv) - return NULL; - - btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); - return block_rsv; -} - -void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv) -{ - if (!rsv) - return; - btrfs_block_rsv_release(fs_info, rsv, (u64)-1); - kfree(rsv); -} - -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, u64 num_bytes, - enum btrfs_reserve_flush_enum flush) -{ - int ret; - - if (num_bytes == 0) - return 0; - - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); - if (!ret) - block_rsv_add_bytes(block_rsv, num_bytes, true); - - return ret; -} - -int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) -{ - u64 num_bytes = 0; - int ret = -ENOSPC; - - if (!block_rsv) - return 0; - - spin_lock(&block_rsv->lock); - num_bytes = div_factor(block_rsv->size, min_factor); - if (block_rsv->reserved >= num_bytes) - ret = 0; - spin_unlock(&block_rsv->lock); - - return ret; -} - -int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, u64 min_reserved, - enum btrfs_reserve_flush_enum flush) -{ - u64 num_bytes = 0; - int ret = -ENOSPC; - - if (!block_rsv) - return 0; - - spin_lock(&block_rsv->lock); - num_bytes = min_reserved; - if (block_rsv->reserved >= num_bytes) - ret = 0; - else - num_bytes -= block_rsv->reserved; - spin_unlock(&block_rsv->lock); - - if (!ret) - return 0; - - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, false); - return 0; - } - - return ret; -} - -static void calc_refill_bytes(struct btrfs_block_rsv *block_rsv, - u64 *metadata_bytes, u64 *qgroup_bytes) -{ - *metadata_bytes = 0; - *qgroup_bytes = 0; - - spin_lock(&block_rsv->lock); - if (block_rsv->reserved < block_rsv->size) - *metadata_bytes = block_rsv->size - block_rsv->reserved; - if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size) - *qgroup_bytes = block_rsv->qgroup_rsv_size - - block_rsv->qgroup_rsv_reserved; - spin_unlock(&block_rsv->lock); -} - -/** - * btrfs_inode_rsv_refill - refill the inode block rsv. - * @inode - the inode we are refilling. - * @flush - the flushing restriction. - * - * Essentially the same as btrfs_block_rsv_refill, except it uses the - * block_rsv->size as the minimum size. We'll either refill the missing amount - * or return if we already have enough space. This will also handle the reserve - * tracepoint for the reserved amount. - */ -static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, - enum btrfs_reserve_flush_enum flush) -{ - struct btrfs_root *root = inode->root; - struct btrfs_block_rsv *block_rsv = &inode->block_rsv; - u64 num_bytes, last = 0; - u64 qgroup_num_bytes; - int ret = -ENOSPC; - - calc_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes); - if (num_bytes == 0) - return 0; - - do { - ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, - true); - if (ret) - return ret; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); - if (ret) { - btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); - last = num_bytes; - /* - * If we are fragmented we can end up with a lot of - * outstanding extents which will make our size be much - * larger than our reserved amount. - * - * If the reservation happens here, it might be very - * big though not needed in the end, if the delalloc - * flushing happens. - * - * If this is the case try and do the reserve again. - */ - if (flush == BTRFS_RESERVE_FLUSH_ALL) - calc_refill_bytes(block_rsv, &num_bytes, - &qgroup_num_bytes); - if (num_bytes == 0) - return 0; - } - } while (ret && last != num_bytes); - - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, false); - trace_btrfs_space_reservation(root->fs_info, "delalloc", - btrfs_ino(inode), num_bytes, 1); - - /* Don't forget to increase qgroup_rsv_reserved */ - spin_lock(&block_rsv->lock); - block_rsv->qgroup_rsv_reserved += qgroup_num_bytes; - spin_unlock(&block_rsv->lock); - } - return ret; -} - -static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, u64 *qgroup_to_release) -{ - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_block_rsv *target = delayed_rsv; - - if (target->full || target == block_rsv) - target = global_rsv; - - if (block_rsv->space_info != target->space_info) - target = NULL; - - return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, - qgroup_to_release); -} - -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); -} - -/** - * btrfs_inode_rsv_release - release any excessive reservation. - * @inode - the inode we need to release from. - * @qgroup_free - free or convert qgroup meta. - * Unlike normal operation, qgroup meta reservation needs to know if we are - * freeing qgroup reservation or just converting it into per-trans. Normally - * @qgroup_free is true for error handling, and false for normal release. - * - * This is the same as btrfs_block_rsv_release, except that it handles the - * tracepoint for the reservation. - */ -static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_block_rsv *block_rsv = &inode->block_rsv; - u64 released = 0; - u64 qgroup_to_release = 0; - - /* - * Since we statically set the block_rsv->size we just want to say we - * are releasing 0 bytes, and then we'll just get the reservation over - * the size free'd. - */ - released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, - &qgroup_to_release); - if (released > 0) - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), released, 0); - if (qgroup_free) - btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); - else - btrfs_qgroup_convert_reserved_meta(inode->root, - qgroup_to_release); -} - -/** - * btrfs_delayed_refs_rsv_release - release a ref head's reservation. - * @fs_info - the fs_info for our fs. - * @nr - the number of items to drop. - * - * This drops the delayed ref head's count from the delayed refs rsv and frees - * any excess reservation we had. - */ -void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) -{ - struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); - u64 released = 0; - - released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, - num_bytes, NULL); - if (released) - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, released, 0); -} - -static void update_global_block_rsv(struct btrfs_fs_info *fs_info) -{ - struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; - struct btrfs_space_info *sinfo = block_rsv->space_info; - u64 num_bytes; - - /* - * The global block rsv is based on the size of the extent tree, the - * checksum tree and the root tree. If the fs is empty we want to set - * it to a minimal amount for safety. - */ - num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + - btrfs_root_used(&fs_info->csum_root->root_item) + - btrfs_root_used(&fs_info->tree_root->root_item); - num_bytes = max_t(u64, num_bytes, SZ_16M); - - spin_lock(&sinfo->lock); - spin_lock(&block_rsv->lock); - - block_rsv->size = min_t(u64, num_bytes, SZ_512M); - - if (block_rsv->reserved < block_rsv->size) { - num_bytes = btrfs_space_info_used(sinfo, true); - if (sinfo->total_bytes > num_bytes) { - num_bytes = sinfo->total_bytes - num_bytes; - num_bytes = min(num_bytes, - block_rsv->size - block_rsv->reserved); - block_rsv->reserved += num_bytes; - update_bytes_may_use(sinfo, num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - sinfo->flags, num_bytes, - 1); - } - } else if (block_rsv->reserved > block_rsv->size) { - num_bytes = block_rsv->reserved - block_rsv->size; - update_bytes_may_use(sinfo, -num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - sinfo->flags, num_bytes, 0); - block_rsv->reserved = block_rsv->size; - } - - if (block_rsv->reserved == block_rsv->size) - block_rsv->full = 1; - else - block_rsv->full = 0; - - spin_unlock(&block_rsv->lock); - spin_unlock(&sinfo->lock); -} - -static void init_global_block_rsv(struct btrfs_fs_info *fs_info) -{ - struct btrfs_space_info *space_info; - - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); - fs_info->chunk_block_rsv.space_info = space_info; - - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - fs_info->global_block_rsv.space_info = space_info; - fs_info->trans_block_rsv.space_info = space_info; - fs_info->empty_block_rsv.space_info = space_info; - fs_info->delayed_block_rsv.space_info = space_info; - fs_info->delayed_refs_rsv.space_info = space_info; - - fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; - fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; - fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; - fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; - if (fs_info->quota_root) - fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; - fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; - - update_global_block_rsv(fs_info); -} - -static void release_global_block_rsv(struct btrfs_fs_info *fs_info) -{ - block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, - (u64)-1, NULL); - WARN_ON(fs_info->trans_block_rsv.size > 0); - WARN_ON(fs_info->trans_block_rsv.reserved > 0); - WARN_ON(fs_info->chunk_block_rsv.size > 0); - WARN_ON(fs_info->chunk_block_rsv.reserved > 0); - WARN_ON(fs_info->delayed_block_rsv.size > 0); - WARN_ON(fs_info->delayed_block_rsv.reserved > 0); - WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); - WARN_ON(fs_info->delayed_refs_rsv.size > 0); -} - -/* - * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv - * @trans - the trans that may have generated delayed refs - * - * This is to be called anytime we may have adjusted trans->delayed_ref_updates, - * it'll calculate the additional size and add it to the delayed_refs_rsv. - */ -void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; - u64 num_bytes; - - if (!trans->delayed_ref_updates) - return; - - num_bytes = btrfs_calc_trans_metadata_size(fs_info, - trans->delayed_ref_updates); - spin_lock(&delayed_rsv->lock); - delayed_rsv->size += num_bytes; - delayed_rsv->full = 0; - spin_unlock(&delayed_rsv->lock); - trans->delayed_ref_updates = 0; -} - -/* - * To be called after all the new block groups attached to the transaction - * handle have been created (btrfs_create_pending_block_groups()). - */ -void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - if (!trans->chunk_bytes_reserved) - return; - - WARN_ON_ONCE(!list_empty(&trans->new_bgs)); - - block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, - trans->chunk_bytes_reserved, NULL); - trans->chunk_bytes_reserved = 0; -} - -/* - * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation - * root: the root of the parent directory - * rsv: block reservation - * items: the number of items that we need do reservation - * use_global_rsv: allow fallback to the global block reservation - * - * This function is used to reserve the space for snapshot/subvolume - * creation and deletion. Those operations are different with the - * common file/directory operations, they change two fs/file trees - * and root tree, the number of items that the qgroup reserves is - * different with the free space reservation. So we can not use - * the space reservation mechanism in start_transaction(). - */ -int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv, int items, - bool use_global_rsv) -{ - u64 qgroup_num_bytes = 0; - u64 num_bytes; - int ret; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { - /* One for parent inode, two for dir entries */ - qgroup_num_bytes = 3 * fs_info->nodesize; - ret = btrfs_qgroup_reserve_meta_prealloc(root, - qgroup_num_bytes, true); - if (ret) - return ret; - } - - num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); - rsv->space_info = __find_space_info(fs_info, - BTRFS_BLOCK_GROUP_METADATA); - ret = btrfs_block_rsv_add(root, rsv, num_bytes, - BTRFS_RESERVE_FLUSH_ALL); - - if (ret == -ENOSPC && use_global_rsv) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); - - if (ret && qgroup_num_bytes) - btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); - - return ret; -} - -void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv) -{ - btrfs_block_rsv_release(fs_info, rsv, (u64)-1); -} - -static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, - struct btrfs_inode *inode) -{ - struct btrfs_block_rsv *block_rsv = &inode->block_rsv; - u64 reserve_size = 0; - u64 qgroup_rsv_size = 0; - u64 csum_leaves; - unsigned outstanding_extents; - - lockdep_assert_held(&inode->lock); - outstanding_extents = inode->outstanding_extents; - if (outstanding_extents) - reserve_size = btrfs_calc_trans_metadata_size(fs_info, - outstanding_extents + 1); - csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, - inode->csum_bytes); - reserve_size += btrfs_calc_trans_metadata_size(fs_info, - csum_leaves); - /* - * For qgroup rsv, the calculation is very simple: - * account one nodesize for each outstanding extent - * - * This is overestimating in most cases. - */ - qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; - - spin_lock(&block_rsv->lock); - block_rsv->size = reserve_size; - block_rsv->qgroup_rsv_size = qgroup_rsv_size; - spin_unlock(&block_rsv->lock); -} - -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - unsigned nr_extents; - enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; - int ret = 0; - bool delalloc_lock = true; - - /* If we are a free space inode we need to not flush since we will be in - * the middle of a transaction commit. We also don't need the delalloc - * mutex since we won't race with anybody. We need this mostly to make - * lockdep shut its filthy mouth. - * - * If we have a transaction open (can happen if we call truncate_block - * from truncate), then we need FLUSH_LIMIT so we don't deadlock. - */ - if (btrfs_is_free_space_inode(inode)) { - flush = BTRFS_RESERVE_NO_FLUSH; - delalloc_lock = false; - } else { - if (current->journal_info) - flush = BTRFS_RESERVE_FLUSH_LIMIT; - - if (btrfs_transaction_in_commit(fs_info)) - schedule_timeout(1); - } - - if (delalloc_lock) - mutex_lock(&inode->delalloc_mutex); - - num_bytes = ALIGN(num_bytes, fs_info->sectorsize); - - /* Add our new extents and calculate the new rsv size. */ - spin_lock(&inode->lock); - nr_extents = count_max_extents(num_bytes); - btrfs_mod_outstanding_extents(inode, nr_extents); - inode->csum_bytes += num_bytes; - btrfs_calculate_inode_block_rsv_size(fs_info, inode); - spin_unlock(&inode->lock); - - ret = btrfs_inode_rsv_refill(inode, flush); - if (unlikely(ret)) - goto out_fail; - - if (delalloc_lock) - mutex_unlock(&inode->delalloc_mutex); - return 0; - -out_fail: - spin_lock(&inode->lock); - nr_extents = count_max_extents(num_bytes); - btrfs_mod_outstanding_extents(inode, -nr_extents); - inode->csum_bytes -= num_bytes; - btrfs_calculate_inode_block_rsv_size(fs_info, inode); - spin_unlock(&inode->lock); - - btrfs_inode_rsv_release(inode, true); - if (delalloc_lock) - mutex_unlock(&inode->delalloc_mutex); - return ret; -} - -/** - * btrfs_delalloc_release_metadata - release a metadata reservation for an inode - * @inode: the inode to release the reservation for. - * @num_bytes: the number of bytes we are releasing. - * @qgroup_free: free qgroup reservation or convert it to per-trans reservation - * - * This will release the metadata reservation for an inode. This can be called - * once we complete IO for a given set of bytes to release their metadata - * reservations, or on error for the same reason. - */ -void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - - num_bytes = ALIGN(num_bytes, fs_info->sectorsize); - spin_lock(&inode->lock); - inode->csum_bytes -= num_bytes; - btrfs_calculate_inode_block_rsv_size(fs_info, inode); - spin_unlock(&inode->lock); - - if (btrfs_is_testing(fs_info)) - return; - - btrfs_inode_rsv_release(inode, qgroup_free); -} - -/** - * btrfs_delalloc_release_extents - release our outstanding_extents - * @inode: the inode to balance the reservation for. - * @num_bytes: the number of bytes we originally reserved with - * @qgroup_free: do we need to free qgroup meta reservation or convert them. - * - * When we reserve space we increase outstanding_extents for the extents we may - * add. Once we've set the range as delalloc or created our ordered extents we - * have outstanding_extents to track the real usage, so we use this to free our - * temporarily tracked outstanding_extents. This _must_ be used in conjunction - * with btrfs_delalloc_reserve_metadata. - */ -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - unsigned num_extents; - - spin_lock(&inode->lock); - num_extents = count_max_extents(num_bytes); - btrfs_mod_outstanding_extents(inode, -num_extents); - btrfs_calculate_inode_block_rsv_size(fs_info, inode); - spin_unlock(&inode->lock); - - if (btrfs_is_testing(fs_info)) - return; - - btrfs_inode_rsv_release(inode, qgroup_free); -} - -/** - * btrfs_delalloc_reserve_space - reserve data and metadata space for - * delalloc - * @inode: inode we're writing to - * @start: start range we are writing to - * @len: how long the range we are writing to - * @reserved: mandatory parameter, record actually reserved qgroup ranges of - * current reservation. - * - * This will do the following things - * - * o reserve space in data space info for num bytes - * and reserve precious corresponding qgroup space - * (Done in check_data_free_space) - * - * o reserve space for metadata space, based on the number of outstanding - * extents and how much csums will be needed - * also reserve metadata space in a per root over-reserve method. - * o add to the inodes->delalloc_bytes - * o add it to the fs_info's delalloc inodes list. - * (Above 3 all done in delalloc_reserve_metadata) - * - * Return 0 for success - * Return <0 for error(-ENOSPC or -EQUOT) - */ -int btrfs_delalloc_reserve_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len) -{ - int ret; - - ret = btrfs_check_data_free_space(inode, reserved, start, len); - if (ret < 0) - return ret; - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); - if (ret < 0) - btrfs_free_reserved_data_space(inode, *reserved, start, len); - return ret; -} - -/** - * btrfs_delalloc_release_space - release data and metadata space for delalloc - * @inode: inode we're releasing space for - * @start: start position of the space already reserved - * @len: the len of the space already reserved - * @release_bytes: the len of the space we consumed or didn't use - * - * This function will release the metadata space that was not used and will - * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes - * list if there are no delalloc bytes left. - * Also it will handle the qgroup reserved space. - */ -void btrfs_delalloc_release_space(struct inode *inode, - struct extent_changeset *reserved, - u64 start, u64 len, bool qgroup_free) -{ - btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); - btrfs_free_reserved_data_space(inode, reserved, start, len); -} - -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *info, u64 bytenr, - u64 num_bytes, int alloc) -{ - struct btrfs_block_group_cache *cache = NULL; - u64 total = num_bytes; - u64 old_val; - u64 byte_in_group; - int factor; - int ret = 0; - - /* block accounting for super block */ - spin_lock(&info->delalloc_root_lock); - old_val = btrfs_super_bytes_used(info->super_copy); - if (alloc) - old_val += num_bytes; - else - old_val -= num_bytes; - btrfs_set_super_bytes_used(info->super_copy, old_val); - spin_unlock(&info->delalloc_root_lock); - - while (total) { - cache = btrfs_lookup_block_group(info, bytenr); - if (!cache) { - ret = -ENOENT; - break; - } - factor = btrfs_bg_type_to_factor(cache->flags); - - /* - * If this block group has free space cache written out, we - * need to make sure to load it if we are removing space. This - * is because we need the unpinning stage to actually add the - * space back to the block group, otherwise we will leak space. - */ - if (!alloc && cache->cached == BTRFS_CACHE_NO) - cache_block_group(cache, 1); - - byte_in_group = bytenr - cache->key.objectid; - WARN_ON(byte_in_group > cache->key.offset); - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - - if (btrfs_test_opt(info, SPACE_CACHE) && - cache->disk_cache_state < BTRFS_DC_CLEAR) - cache->disk_cache_state = BTRFS_DC_CLEAR; - - old_val = btrfs_block_group_used(&cache->item); - num_bytes = min(total, cache->key.offset - byte_in_group); - if (alloc) { - old_val += num_bytes; - btrfs_set_block_group_used(&cache->item, old_val); - cache->reserved -= num_bytes; - cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->bytes_used += num_bytes; - cache->space_info->disk_used += num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - } else { - old_val -= num_bytes; - btrfs_set_block_group_used(&cache->item, old_val); - cache->pinned += num_bytes; - update_bytes_pinned(cache->space_info, num_bytes); - cache->space_info->bytes_used -= num_bytes; - cache->space_info->disk_used -= num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - trace_btrfs_space_reservation(info, "pinned", - cache->space_info->flags, - num_bytes, 1); - percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, - num_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - set_extent_dirty(info->pinned_extents, - bytenr, bytenr + num_bytes - 1, - GFP_NOFS | __GFP_NOFAIL); - } - - spin_lock(&trans->transaction->dirty_bgs_lock); - if (list_empty(&cache->dirty_list)) { - list_add_tail(&cache->dirty_list, - &trans->transaction->dirty_bgs); - trans->transaction->num_dirty_bgs++; - trans->delayed_ref_updates++; - btrfs_get_block_group(cache); - } - spin_unlock(&trans->transaction->dirty_bgs_lock); - - /* - * No longer have used bytes in this block group, queue it for - * deletion. We do this after adding the block group to the - * dirty list to avoid races between cleaner kthread and space - * cache writeout. - */ - if (!alloc && old_val == 0) - btrfs_mark_bg_unused(cache); - - btrfs_put_block_group(cache); - total -= num_bytes; - bytenr += num_bytes; - } - - /* Modified block groups are accounted for in the delayed_refs_rsv. */ - btrfs_update_delayed_refs_rsv(trans); + ret = btrfs_get_alloc_profile(fs_info, flags); return ret; } static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; u64 bytenr; spin_lock(&fs_info->block_group_cache_lock); @@ -6485,20 +2581,22 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) if (!cache) return 0; - bytenr = cache->key.objectid; + bytenr = cache->start; btrfs_put_block_group(cache); return bytenr; } -static int pin_down_extent(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, +static int pin_down_extent(struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { + struct btrfs_fs_info *fs_info = cache->fs_info; + spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - update_bytes_pinned(cache->space_info, num_bytes); + btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, + num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; @@ -6506,8 +2604,6 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - trace_btrfs_space_reservation(fs_info, "pinned", - cache->space_info->flags, num_bytes, 1); percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); set_extent_dirty(fs_info->pinned_extents, bytenr, @@ -6515,18 +2611,17 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info, return 0; } -/* - * this function must be called within transaction - */ int btrfs_pin_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, int reserved) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; + + ASSERT(fs_info->running_transaction); cache = btrfs_lookup_block_group(fs_info, bytenr); BUG_ON(!cache); /* Logic error */ - pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved); + pin_down_extent(cache, bytenr, num_bytes, reserved); btrfs_put_block_group(cache); return 0; @@ -6538,7 +2633,7 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info, int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; int ret; cache = btrfs_lookup_block_group(fs_info, bytenr); @@ -6551,9 +2646,9 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, * to one because the slow code to read in the free extents does check * the pinned extents. */ - cache_block_group(cache, 1); + btrfs_cache_block_group(cache, 1); - pin_down_extent(fs_info, cache, bytenr, num_bytes, 0); + pin_down_extent(cache, bytenr, num_bytes, 0); /* remove us from the free space cache (if we're there at all) */ ret = btrfs_remove_free_space(cache, bytenr, num_bytes); @@ -6565,25 +2660,26 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, u64 start, u64 num_bytes) { int ret; - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_caching_control *caching_ctl; block_group = btrfs_lookup_block_group(fs_info, start); if (!block_group) return -EINVAL; - cache_block_group(block_group, 0); - caching_ctl = get_caching_control(block_group); + btrfs_cache_block_group(block_group, 0); + caching_ctl = btrfs_get_caching_control(block_group); if (!caching_ctl) { /* Logic error */ - BUG_ON(!block_group_cache_done(block_group)); + BUG_ON(!btrfs_block_group_done(block_group)); ret = btrfs_remove_free_space(block_group, start, num_bytes); } else { mutex_lock(&caching_ctl->mutex); if (start >= caching_ctl->progress) { - ret = add_excluded_extent(fs_info, start, num_bytes); + ret = btrfs_add_excluded_extent(fs_info, start, + num_bytes); } else if (start + num_bytes <= caching_ctl->progress) { ret = btrfs_remove_free_space(block_group, start, num_bytes); @@ -6597,19 +2693,20 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, num_bytes = (start + num_bytes) - caching_ctl->progress; start = caching_ctl->progress; - ret = add_excluded_extent(fs_info, start, num_bytes); + ret = btrfs_add_excluded_extent(fs_info, start, + num_bytes); } out_lock: mutex_unlock(&caching_ctl->mutex); - put_caching_control(caching_ctl); + btrfs_put_caching_control(caching_ctl); } btrfs_put_block_group(block_group); return ret; } -int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) +int btrfs_exclude_logged_extents(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_file_extent_item *item; struct btrfs_key key; int found_type; @@ -6640,127 +2737,26 @@ int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, } static void -btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) +btrfs_inc_block_group_reservations(struct btrfs_block_group *bg) { atomic_inc(&bg->reservations); } -void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, - const u64 start) -{ - struct btrfs_block_group_cache *bg; - - bg = btrfs_lookup_block_group(fs_info, start); - ASSERT(bg); - if (atomic_dec_and_test(&bg->reservations)) - wake_up_var(&bg->reservations); - btrfs_put_block_group(bg); -} - -void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) -{ - struct btrfs_space_info *space_info = bg->space_info; - - ASSERT(bg->ro); - - if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) - return; - - /* - * Our block group is read only but before we set it to read only, - * some task might have had allocated an extent from it already, but it - * has not yet created a respective ordered extent (and added it to a - * root's list of ordered extents). - * Therefore wait for any task currently allocating extents, since the - * block group's reservations counter is incremented while a read lock - * on the groups' semaphore is held and decremented after releasing - * the read access on that semaphore and creating the ordered extent. - */ - down_write(&space_info->groups_sem); - up_write(&space_info->groups_sem); - - wait_var_event(&bg->reservations, !atomic_read(&bg->reservations)); -} - -/** - * btrfs_add_reserved_bytes - update the block_group and space info counters - * @cache: The cache we are manipulating - * @ram_bytes: The number of bytes of file content, and will be same to - * @num_bytes except for the compress path. - * @num_bytes: The number of bytes in question - * @delalloc: The blocks are allocated for the delalloc write - * - * This is called by the allocator when it reserves space. If this is a - * reservation and the block group has become read only we cannot make the - * reservation and return -EAGAIN, otherwise this function always succeeds. - */ -static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 ram_bytes, u64 num_bytes, int delalloc) -{ - struct btrfs_space_info *space_info = cache->space_info; - int ret = 0; - - spin_lock(&space_info->lock); - spin_lock(&cache->lock); - if (cache->ro) { - ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - update_bytes_may_use(space_info, -ram_bytes); - if (delalloc) - cache->delalloc_bytes += num_bytes; - } - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); - return ret; -} - -/** - * btrfs_free_reserved_bytes - update the block_group and space info counters - * @cache: The cache we are manipulating - * @num_bytes: The number of bytes in question - * @delalloc: The blocks are allocated for the delalloc write - * - * This is called by somebody who is freeing space that was never actually used - * on disk. For example if you reserve some space for a new leaf in transaction - * A and before transaction A commits you free that leaf, you call this with - * reserve set to 0 in order to clear the reservation. - */ - -static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int delalloc) -{ - struct btrfs_space_info *space_info = cache->space_info; - - spin_lock(&space_info->lock); - spin_lock(&cache->lock); - if (cache->ro) - space_info->bytes_readonly += num_bytes; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; - space_info->max_extent_size = 0; - - if (delalloc) - cache->delalloc_bytes -= num_bytes; - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); -} void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) { struct btrfs_caching_control *next; struct btrfs_caching_control *caching_ctl; - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; down_write(&fs_info->commit_root_sem); list_for_each_entry_safe(caching_ctl, next, &fs_info->caching_block_groups, list) { cache = caching_ctl->block_group; - if (block_group_cache_done(cache)) { + if (btrfs_block_group_done(cache)) { cache->last_byte_to_unpin = (u64)-1; list_del_init(&caching_ctl->list); - put_caching_control(caching_ctl); + btrfs_put_caching_control(caching_ctl); } else { cache->last_byte_to_unpin = caching_ctl->progress; } @@ -6773,7 +2769,7 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) up_write(&fs_info->commit_root_sem); - update_global_block_rsv(fs_info); + btrfs_update_global_block_rsv(fs_info); } /* @@ -6809,7 +2805,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end, const bool return_free_space) { - struct btrfs_block_group_cache *cache = NULL; + struct btrfs_block_group *cache = NULL; struct btrfs_space_info *space_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_free_cluster *cluster = NULL; @@ -6821,7 +2817,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, while (start <= end) { readonly = false; if (!cache || - start >= cache->key.objectid + cache->key.offset) { + start >= cache->start + cache->length) { if (cache) btrfs_put_block_group(cache); total_unpinned = 0; @@ -6834,7 +2830,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, empty_cluster <<= 1; } - len = cache->key.objectid + cache->key.offset - start; + len = cache->start + cache->length - start; len = min(len, end + 1 - start); if (start < cache->last_byte_to_unpin) { @@ -6863,10 +2859,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - update_bytes_pinned(space_info, -len); - - trace_btrfs_space_reservation(fs_info, "pinned", - space_info->flags, len, 0); + btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); space_info->max_extent_size = 0; percpu_counter_add_batch(&space_info->total_bytes_pinned, -len, BTRFS_TOTAL_BYTES_PINNED_BATCH); @@ -6884,20 +2877,17 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, to_add = min(len, global_rsv->size - global_rsv->reserved); global_rsv->reserved += to_add; - update_bytes_may_use(space_info, to_add); + btrfs_space_info_update_bytes_may_use(fs_info, + space_info, to_add); if (global_rsv->reserved >= global_rsv->size) global_rsv->full = 1; - trace_btrfs_space_reservation(fs_info, - "space_info", - space_info->flags, - to_add, 1); len -= to_add; } spin_unlock(&global_rsv->lock); /* Add to any tickets we may have */ if (len) - space_info_add_new_bytes(fs_info, space_info, - len); + btrfs_try_granting_tickets(fs_info, + space_info); } spin_unlock(&space_info->lock); } @@ -6910,7 +2900,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *block_group, *tmp; + struct btrfs_block_group *block_group, *tmp; struct list_head *deleted_bgs; struct extent_io_tree *unpin; u64 start; @@ -6956,8 +2946,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) ret = -EROFS; if (!trans->aborted) ret = btrfs_discard_extent(fs_info, - block_group->key.objectid, - block_group->key.offset, + block_group->start, + block_group->length, &trimmed); list_del_init(&block_group->bg_list); @@ -7185,7 +3175,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (is_data) { - ret = btrfs_del_csums(trans, info, bytenr, num_bytes); + ret = btrfs_del_csums(trans, info->csum_root, bytenr, + num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -7198,7 +3189,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } - ret = update_block_group(trans, info, bytenr, num_bytes, 0); + ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -7272,28 +3263,27 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 parent, int last_ref) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ref generic_ref = { 0 }; int pin = 1; int ret; + btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, + buf->start, buf->len, parent); + btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), + root->root_key.objectid); + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { int old_ref_mod, new_ref_mod; - btrfs_ref_tree_mod(root, buf->start, buf->len, parent, - root->root_key.objectid, - btrfs_header_level(buf), 0, - BTRFS_DROP_DELAYED_REF); - ret = btrfs_add_delayed_tree_ref(trans, buf->start, - buf->len, parent, - root->root_key.objectid, - btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL, + btrfs_ref_tree_mod(fs_info, &generic_ref); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL, &old_ref_mod, &new_ref_mod); BUG_ON(ret); /* -ENOMEM */ pin = old_ref_mod >= 0 && new_ref_mod < 0; } if (last_ref && btrfs_header_generation(buf) == trans->transid) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, buf->start); @@ -7305,8 +3295,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(fs_info, cache, buf->start, - buf->len, 1); + pin_down_extent(cache, buf->start, buf->len, 1); btrfs_put_block_group(cache); goto out; } @@ -7320,8 +3309,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, } out: if (pin) - add_pinned_bytes(fs_info, buf->len, true, - root->root_key.objectid); + add_pinned_bytes(fs_info, &generic_ref); if (last_ref) { /* @@ -7333,120 +3321,63 @@ out: } /* Can return -ENOMEM */ -int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset) +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; int old_ref_mod, new_ref_mod; int ret; if (btrfs_is_testing(fs_info)) return 0; - if (root_objectid != BTRFS_TREE_LOG_OBJECTID) - btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, - root_objectid, owner, offset, - BTRFS_DROP_DELAYED_REF); - /* * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. */ - if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { - WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); + if ((ref->type == BTRFS_REF_METADATA && + ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || + (ref->type == BTRFS_REF_DATA && + ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { /* unlocks the pinned mutex */ - btrfs_pin_extent(fs_info, bytenr, num_bytes, 1); + btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1); old_ref_mod = new_ref_mod = 0; ret = 0; - } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, - num_bytes, parent, - root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, + } else if (ref->type == BTRFS_REF_METADATA) { + ret = btrfs_add_delayed_tree_ref(trans, ref, NULL, &old_ref_mod, &new_ref_mod); } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, - num_bytes, parent, - root_objectid, owner, offset, - 0, BTRFS_DROP_DELAYED_REF, + ret = btrfs_add_delayed_data_ref(trans, ref, 0, &old_ref_mod, &new_ref_mod); } - if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) { - bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + if (!((ref->type == BTRFS_REF_METADATA && + ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || + (ref->type == BTRFS_REF_DATA && + ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) + btrfs_ref_tree_mod(fs_info, ref); - add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid); - } + if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) + add_pinned_bytes(fs_info, ref); return ret; } -/* - * when we wait for progress in the block group caching, its because - * our allocation attempt failed at least once. So, we must sleep - * and let some progress happen before we try again. - * - * This function will sleep at least once waiting for new free space to - * show up, and then it will check the block group free space numbers - * for our min num_bytes. Another option is to have it go ahead - * and look in the rbtree for a free extent of a given size, but this - * is a good start. - * - * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using - * any of the information in this block group. - */ -static noinline void -wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, - u64 num_bytes) -{ - struct btrfs_caching_control *caching_ctl; - - caching_ctl = get_caching_control(cache); - if (!caching_ctl) - return; - - wait_event(caching_ctl->wait, block_group_cache_done(cache) || - (cache->free_space_ctl->free_space >= num_bytes)); - - put_caching_control(caching_ctl); -} - -static noinline int -wait_block_group_cache_done(struct btrfs_block_group_cache *cache) -{ - struct btrfs_caching_control *caching_ctl; - int ret = 0; - - caching_ctl = get_caching_control(cache); - if (!caching_ctl) - return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; - - wait_event(caching_ctl->wait, block_group_cache_done(cache)); - if (cache->cached == BTRFS_CACHE_ERROR) - ret = -EIO; - put_caching_control(caching_ctl); - return ret; -} - enum btrfs_loop_type { - LOOP_CACHING_NOWAIT = 0, - LOOP_CACHING_WAIT = 1, - LOOP_ALLOC_CHUNK = 2, - LOOP_NO_EMPTY_SIZE = 3, + LOOP_CACHING_NOWAIT, + LOOP_CACHING_WAIT, + LOOP_ALLOC_CHUNK, + LOOP_NO_EMPTY_SIZE, }; static inline void -btrfs_lock_block_group(struct btrfs_block_group_cache *cache, +btrfs_lock_block_group(struct btrfs_block_group *cache, int delalloc) { if (delalloc) down_read(&cache->data_rwsem); } -static inline void -btrfs_grab_block_group(struct btrfs_block_group_cache *cache, +static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, int delalloc) { btrfs_get_block_group(cache); @@ -7454,12 +3385,12 @@ btrfs_grab_block_group(struct btrfs_block_group_cache *cache, down_read(&cache->data_rwsem); } -static struct btrfs_block_group_cache * -btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, +static struct btrfs_block_group *btrfs_lock_cluster( + struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, int delalloc) { - struct btrfs_block_group_cache *used_bg = NULL; + struct btrfs_block_group *used_bg = NULL; spin_lock(&cluster->refill_lock); while (1) { @@ -7493,7 +3424,7 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, } static inline void -btrfs_release_block_group(struct btrfs_block_group_cache *cache, +btrfs_release_block_group(struct btrfs_block_group *cache, int delalloc) { if (delalloc) @@ -7564,13 +3495,12 @@ struct find_free_extent_ctl { * Return >0 to inform caller that we find nothing * Return 0 means we have found a location and set ffe_ctl->found_offset. */ -static int find_free_extent_clustered(struct btrfs_block_group_cache *bg, +static int find_free_extent_clustered(struct btrfs_block_group *bg, struct btrfs_free_cluster *last_ptr, struct find_free_extent_ctl *ffe_ctl, - struct btrfs_block_group_cache **cluster_bg_ret) + struct btrfs_block_group **cluster_bg_ret) { - struct btrfs_fs_info *fs_info = bg->fs_info; - struct btrfs_block_group_cache *cluster_bg; + struct btrfs_block_group *cluster_bg; u64 aligned_cluster; u64 offset; int ret; @@ -7583,7 +3513,7 @@ static int find_free_extent_clustered(struct btrfs_block_group_cache *bg, goto release_cluster; offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, - ffe_ctl->num_bytes, cluster_bg->key.objectid, + ffe_ctl->num_bytes, cluster_bg->start, &ffe_ctl->max_extent_size); if (offset) { /* We have a block, we're done */ @@ -7629,9 +3559,8 @@ refill_cluster: aligned_cluster = max_t(u64, ffe_ctl->empty_cluster + ffe_ctl->empty_size, bg->full_stripe_len); - ret = btrfs_find_space_cluster(fs_info, bg, last_ptr, - ffe_ctl->search_start, ffe_ctl->num_bytes, - aligned_cluster); + ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start, + ffe_ctl->num_bytes, aligned_cluster); if (ret == 0) { /* Now pull our allocation out of this cluster */ offset = btrfs_alloc_from_cluster(bg, last_ptr, @@ -7651,7 +3580,7 @@ refill_cluster: spin_unlock(&last_ptr->refill_lock); ffe_ctl->retry_clustered = true; - wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + + btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + ffe_ctl->empty_cluster + ffe_ctl->empty_size); return -EAGAIN; } @@ -7670,7 +3599,7 @@ refill_cluster: * Return 0 when we found an free extent and set ffe_ctrl->found_offset * Return -EAGAIN to inform caller that we need to re-search this block group */ -static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg, +static int find_free_extent_unclustered(struct btrfs_block_group *bg, struct btrfs_free_cluster *last_ptr, struct find_free_extent_ctl *ffe_ctl) { @@ -7718,8 +3647,8 @@ static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg, */ if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT) { - wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + - ffe_ctl->empty_size); + btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + + ffe_ctl->empty_size); ffe_ctl->retry_unclustered = true; return -EAGAIN; } else if (!offset) { @@ -7802,8 +3731,8 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return ret; } - ret = do_chunk_alloc(trans, ffe_ctl->flags, - CHUNK_ALLOC_FORCE); + ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, + CHUNK_ALLOC_FORCE); /* * If we can't allocate a new chunk we've already looped @@ -7871,8 +3800,9 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 flags, int delalloc) { int ret = 0; + int cache_block_group_error = 0; struct btrfs_free_cluster *last_ptr = NULL; - struct btrfs_block_group_cache *block_group = NULL; + struct btrfs_block_group *block_group = NULL; struct find_free_extent_ctl ffe_ctl = {0}; struct btrfs_space_info *space_info; bool use_cluster = true; @@ -7899,7 +3829,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, trace_find_free_extent(fs_info, num_bytes, empty_size, flags); - space_info = __find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, flags); if (!space_info) { btrfs_err(fs_info, "No space info for %llu", flags); return -ENOSPC; @@ -7995,7 +3925,7 @@ search: continue; btrfs_grab_block_group(block_group, delalloc); - ffe_ctl.search_start = block_group->key.objectid; + ffe_ctl.search_start = block_group->start; /* * this can happen if we end up cycling through all the @@ -8004,9 +3934,8 @@ search: */ if (!block_group_bits(block_group, flags)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6 | + BTRFS_BLOCK_GROUP_RAID1_MASK | + BTRFS_BLOCK_GROUP_RAID56_MASK | BTRFS_BLOCK_GROUP_RAID10; /* @@ -8016,14 +3945,35 @@ search: */ if ((flags & extra) && !(block_group->flags & extra)) goto loop; + + /* + * This block group has different flags than we want. + * It's possible that we have MIXED_GROUP flag but no + * block group is mixed. Just skip such block group. + */ + btrfs_release_block_group(block_group, delalloc); + continue; } have_block_group: - ffe_ctl.cached = block_group_cache_done(block_group); + ffe_ctl.cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl.cached)) { ffe_ctl.have_caching_bg = true; - ret = cache_block_group(block_group, 0); - BUG_ON(ret < 0); + ret = btrfs_cache_block_group(block_group, 0); + + /* + * If we get ENOMEM here or something else we want to + * try other block groups, because it may not be fatal. + * However if we can't find anything else we need to + * save our return here so that we return the actual + * error that caused problems, not ENOSPC. + */ + if (ret < 0) { + if (!cache_block_group_error) + cache_block_group_error = ret; + ret = 0; + goto loop; + } ret = 0; } @@ -8035,7 +3985,7 @@ have_block_group: * lets look there */ if (last_ptr && use_cluster) { - struct btrfs_block_group_cache *cluster_bg = NULL; + struct btrfs_block_group *cluster_bg = NULL; ret = find_free_extent_clustered(block_group, last_ptr, &ffe_ctl, &cluster_bg); @@ -8068,7 +4018,7 @@ checks: /* move on to the next group */ if (ffe_ctl.search_start + num_bytes > - block_group->key.objectid + block_group->key.offset) { + block_group->start + block_group->length) { btrfs_add_free_space(block_group, ffe_ctl.found_offset, num_bytes); goto loop; @@ -8110,7 +4060,7 @@ loop: if (ret > 0) goto search; - if (ret == -ENOSPC) { + if (ret == -ENOSPC && !cache_block_group_error) { /* * Use ffe_ctl->total_free_space as fallback if we can't find * any contiguous hole. @@ -8121,64 +4071,12 @@ loop: space_info->max_extent_size = ffe_ctl.max_extent_size; spin_unlock(&space_info->lock); ins->offset = ffe_ctl.max_extent_size; + } else if (ret == -ENOSPC) { + ret = cache_block_group_error; } return ret; } -#define DUMP_BLOCK_RSV(fs_info, rsv_name) \ -do { \ - struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ - spin_lock(&__rsv->lock); \ - btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ - __rsv->size, __rsv->reserved); \ - spin_unlock(&__rsv->lock); \ -} while (0) - -static void dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info, u64 bytes, - int dump_block_groups) -{ - struct btrfs_block_group_cache *cache; - int index = 0; - - spin_lock(&info->lock); - btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", - info->flags, - info->total_bytes - btrfs_space_info_used(info, true), - info->full ? "" : "not "); - btrfs_info(fs_info, - "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", - info->total_bytes, info->bytes_used, info->bytes_pinned, - info->bytes_reserved, info->bytes_may_use, - info->bytes_readonly); - spin_unlock(&info->lock); - - DUMP_BLOCK_RSV(fs_info, global_block_rsv); - DUMP_BLOCK_RSV(fs_info, trans_block_rsv); - DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); - DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); - DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); - - if (!dump_block_groups) - return; - - down_read(&info->groups_sem); -again: - list_for_each_entry(cache, &info->block_groups[index], list) { - spin_lock(&cache->lock); - btrfs_info(fs_info, - "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", - cache->key.objectid, cache->key.offset, - btrfs_block_group_used(&cache->item), cache->pinned, - cache->reserved, cache->ro ? "[readonly]" : ""); - btrfs_dump_free_space(cache, bytes); - spin_unlock(&cache->lock); - } - if (++index < BTRFS_NR_RAID_TYPES) - goto again; - up_read(&info->groups_sem); -} - /* * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a * hole that is at least as big as @num_bytes. @@ -8254,12 +4152,13 @@ again: } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; - sinfo = __find_space_info(fs_info, flags); + sinfo = btrfs_find_space_info(fs_info, flags); btrfs_err(fs_info, "allocation failed flags %llu, wanted %llu", flags, num_bytes); if (sinfo) - dump_space_info(fs_info, sinfo, num_bytes, 1); + btrfs_dump_space_info(fs_info, sinfo, + num_bytes, 1); } } @@ -8270,7 +4169,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int pin, int delalloc) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; int ret = 0; cache = btrfs_lookup_block_group(fs_info, start); @@ -8281,7 +4180,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, } if (pin) - pin_down_extent(fs_info, cache, start, len, 1); + pin_down_extent(cache, start, len, 1); else { if (btrfs_test_opt(fs_info, DISCARD)) ret = btrfs_discard_extent(fs_info, start, len, NULL); @@ -8370,7 +4269,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1); + ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -8460,8 +4359,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = update_block_group(trans, fs_info, extent_key.objectid, - fs_info->nodesize, 1); + ret = btrfs_update_block_group(trans, extent_key.objectid, + fs_info->nodesize, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", extent_key.objectid, extent_key.offset); @@ -8478,19 +4377,17 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 offset, u64 ram_bytes, struct btrfs_key *ins) { + struct btrfs_ref generic_ref = { 0 }; int ret; BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); - btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0, - root->root_key.objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT); - - ret = btrfs_add_delayed_data_ref(trans, ins->objectid, - ins->offset, 0, - root->root_key.objectid, owner, - offset, ram_bytes, - BTRFS_ADD_DELAYED_EXTENT, NULL, NULL); + btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, + ins->objectid, ins->offset, 0); + btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset); + btrfs_ref_tree_mod(root->fs_info, &generic_ref); + ret = btrfs_add_delayed_data_ref(trans, &generic_ref, + ram_bytes, NULL, NULL); return ret; } @@ -8505,7 +4402,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; int ret; - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; /* @@ -8563,7 +4460,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); - clean_tree_block(fs_info, buf); + btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking_write(buf); @@ -8599,73 +4496,6 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, return buf; } -static struct btrfs_block_rsv * -use_block_rsv(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u32 blocksize) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *block_rsv; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - int ret; - bool global_updated = false; - - block_rsv = get_block_rsv(trans, root); - - if (unlikely(block_rsv->size == 0)) - goto try_reserve; -again: - ret = block_rsv_use_bytes(block_rsv, blocksize); - if (!ret) - return block_rsv; - - if (block_rsv->failfast) - return ERR_PTR(ret); - - if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { - global_updated = true; - update_global_block_rsv(fs_info); - goto again; - } - - /* - * The global reserve still exists to save us from ourselves, so don't - * warn_on if we are short on our delayed refs reserve. - */ - if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && - btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL * 10, - /*DEFAULT_RATELIMIT_BURST*/ 1); - if (__ratelimit(&_rs)) - WARN(1, KERN_DEBUG - "BTRFS: block rsv returned %d\n", ret); - } -try_reserve: - ret = reserve_metadata_bytes(root, block_rsv, blocksize, - BTRFS_RESERVE_NO_FLUSH); - if (!ret) - return block_rsv; - /* - * If we couldn't reserve metadata bytes try and use some from - * the global reserve if its space type is the same as the global - * reservation. - */ - if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && - block_rsv->space_info == global_rsv->space_info) { - ret = block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - } - return ERR_PTR(ret); -} - -static void unuse_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, u32 blocksize) -{ - block_rsv_add_bytes(block_rsv, blocksize, false); - block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL); -} - /* * finds a free extent and does all the dirty work required for allocation * returns the tree buffer or an ERR_PTR on error. @@ -8682,6 +4512,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; struct btrfs_delayed_extent_op *extent_op; + struct btrfs_ref generic_ref = { 0 }; u64 flags = 0; int ret; u32 blocksize = fs_info->nodesize; @@ -8697,7 +4528,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, } #endif - block_rsv = use_block_rsv(trans, root, blocksize); + block_rsv = btrfs_use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); @@ -8736,13 +4567,12 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->is_data = false; extent_op->level = level; - btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent, - root_objectid, level, 0, - BTRFS_ADD_DELAYED_EXTENT); - ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, - ins.offset, parent, - root_objectid, level, - BTRFS_ADD_DELAYED_EXTENT, + btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, + ins.objectid, ins.offset, parent); + generic_ref.real_root = root->root_key.objectid; + btrfs_init_tree_ref(&generic_ref, level, root_objectid); + btrfs_ref_tree_mod(fs_info, &generic_ref); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op, NULL, NULL); if (ret) goto out_free_delayed; @@ -8756,7 +4586,7 @@ out_free_buf: out_free_reserved: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); out_unuse: - unuse_block_rsv(fs_info, block_rsv, blocksize); + btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); return ERR_PTR(ret); } @@ -8918,7 +4748,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start, + ret = btrfs_set_disk_extent_flags(trans, eb->start, eb->len, flag, btrfs_header_level(eb), 0); BUG_ON(ret); /* -ENOMEM */ @@ -8987,6 +4817,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 parent; struct btrfs_key key; struct btrfs_key first_key; + struct btrfs_ref ref = { 0 }; struct extent_buffer *next; int level = wc->level; int reada = 0; @@ -9159,9 +4990,10 @@ skip: wc->drop_level = level; find_next_key(path, level, &wc->drop_progress); - ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize, - parent, root->root_key.objectid, - level - 1, 0); + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, + fs_info->nodesize, parent); + btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid); + ret = btrfs_free_extent(trans, &ref); if (ret) goto out_unlock; } @@ -9251,21 +5083,23 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, else ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_qgroup_trace_leaf_items(trans, eb); - if (ret) { - btrfs_err_rl(fs_info, - "error %d accounting leaf items. Quota is out of sync, rescan required.", + if (is_fstree(root->root_key.objectid)) { + ret = btrfs_qgroup_trace_leaf_items(trans, eb); + if (ret) { + btrfs_err_rl(fs_info, + "error %d accounting leaf items, quota is out of sync, rescan required", ret); + } } } - /* make block locked assertion in clean_tree_block happy */ + /* make block locked assertion in btrfs_clean_tree_block happy */ if (!path->locks[level] && btrfs_header_generation(eb) == trans->transid) { btrfs_tree_lock(eb); btrfs_set_lock_blocking_write(eb); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; } - clean_tree_block(fs_info, eb); + btrfs_clean_tree_block(eb); } if (eb == root->node) { @@ -9638,7 +5472,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, btrfs_assert_tree_locked(parent); parent_level = btrfs_header_level(parent); - extent_buffer_get(parent); + atomic_inc(&parent->refs); path->nodes[parent_level] = parent; path->slots[parent_level] = btrfs_header_nritems(parent); @@ -9676,195 +5510,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) -{ - u64 num_devices; - u64 stripped; - - /* - * if restripe for this chunk_type is on pick target profile and - * return, otherwise do the usual balance - */ - stripped = get_restripe_target(fs_info, flags); - if (stripped) - return extended_to_chunk(stripped); - - num_devices = fs_info->fs_devices->rw_devices; - - stripped = BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; - - if (num_devices == 1) { - stripped |= BTRFS_BLOCK_GROUP_DUP; - stripped = flags & ~stripped; - - /* turn raid0 into single device chunks */ - if (flags & BTRFS_BLOCK_GROUP_RAID0) - return stripped; - - /* turn mirroring into duplication */ - if (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - return stripped | BTRFS_BLOCK_GROUP_DUP; - } else { - /* they already had raid on here, just return */ - if (flags & stripped) - return flags; - - stripped |= BTRFS_BLOCK_GROUP_DUP; - stripped = flags & ~stripped; - - /* switch duplicated blocks with raid1 */ - if (flags & BTRFS_BLOCK_GROUP_DUP) - return stripped | BTRFS_BLOCK_GROUP_RAID1; - - /* this is drive concat, leave it alone */ - } - - return flags; -} - -static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) -{ - struct btrfs_space_info *sinfo = cache->space_info; - u64 num_bytes; - u64 sinfo_used; - u64 min_allocable_bytes; - int ret = -ENOSPC; - - /* - * We need some metadata space and system metadata space for - * allocating chunks in some corner cases until we force to set - * it to be readonly. - */ - if ((sinfo->flags & - (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && - !force) - min_allocable_bytes = SZ_1M; - else - min_allocable_bytes = 0; - - spin_lock(&sinfo->lock); - spin_lock(&cache->lock); - - if (cache->ro) { - cache->ro++; - ret = 0; - goto out; - } - - num_bytes = cache->key.offset - cache->reserved - cache->pinned - - cache->bytes_super - btrfs_block_group_used(&cache->item); - sinfo_used = btrfs_space_info_used(sinfo, true); - - if (sinfo_used + num_bytes + min_allocable_bytes <= - sinfo->total_bytes) { - sinfo->bytes_readonly += num_bytes; - cache->ro++; - list_add_tail(&cache->ro_list, &sinfo->ro_bgs); - ret = 0; - } -out: - spin_unlock(&cache->lock); - spin_unlock(&sinfo->lock); - if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { - btrfs_info(cache->fs_info, - "unable to make block group %llu ro", - cache->key.objectid); - btrfs_info(cache->fs_info, - "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", - sinfo_used, num_bytes, min_allocable_bytes); - dump_space_info(cache->fs_info, cache->space_info, 0, 0); - } - return ret; -} - -int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache) - -{ - struct btrfs_fs_info *fs_info = cache->fs_info; - struct btrfs_trans_handle *trans; - u64 alloc_flags; - int ret; - -again: - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - /* - * we're not allowed to set block groups readonly after the dirty - * block groups cache has started writing. If it already started, - * back off and let this transaction commit - */ - mutex_lock(&fs_info->ro_block_group_mutex); - if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { - u64 transid = trans->transid; - - mutex_unlock(&fs_info->ro_block_group_mutex); - btrfs_end_transaction(trans); - - ret = btrfs_wait_for_commit(fs_info, transid); - if (ret) - return ret; - goto again; - } - - /* - * if we are changing raid levels, try to allocate a corresponding - * block group with the new raid level. - */ - alloc_flags = update_block_group_flags(fs_info, cache->flags); - if (alloc_flags != cache->flags) { - ret = do_chunk_alloc(trans, alloc_flags, - CHUNK_ALLOC_FORCE); - /* - * ENOSPC is allowed here, we may have enough space - * already allocated at the new raid level to - * carry on - */ - if (ret == -ENOSPC) - ret = 0; - if (ret < 0) - goto out; - } - - ret = inc_block_group_ro(cache, 0); - if (!ret) - goto out; - alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); - ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); - if (ret < 0) - goto out; - ret = inc_block_group_ro(cache, 0); -out: - if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { - alloc_flags = update_block_group_flags(fs_info, cache->flags); - mutex_lock(&fs_info->chunk_mutex); - check_system_chunk(trans, alloc_flags); - mutex_unlock(&fs_info->chunk_mutex); - } - mutex_unlock(&fs_info->ro_block_group_mutex); - - btrfs_end_transaction(trans); - return ret; -} - -int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) -{ - u64 alloc_flags = get_alloc_profile(trans->fs_info, type); - - return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); -} - /* * helper to account the unused space of all the readonly block group in the * space_info. takes mirrors into account. */ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; u64 free_bytes = 0; int factor; @@ -9882,9 +5534,8 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) } factor = btrfs_bg_type_to_factor(block_group->flags); - free_bytes += (block_group->key.offset - - btrfs_block_group_used(&block_group->item)) * - factor; + free_bytes += (block_group->length - + block_group->used) * factor; spin_unlock(&block_group->lock); } @@ -9893,1401 +5544,6 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) return free_bytes; } -void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) -{ - struct btrfs_space_info *sinfo = cache->space_info; - u64 num_bytes; - - BUG_ON(!cache->ro); - - spin_lock(&sinfo->lock); - spin_lock(&cache->lock); - if (!--cache->ro) { - num_bytes = cache->key.offset - cache->reserved - - cache->pinned - cache->bytes_super - - btrfs_block_group_used(&cache->item); - sinfo->bytes_readonly -= num_bytes; - list_del_init(&cache->ro_list); - } - spin_unlock(&cache->lock); - spin_unlock(&sinfo->lock); -} - -/* - * Checks to see if it's even possible to relocate this block group. - * - * @return - -1 if it's not a good idea to relocate this block group, 0 if its - * ok to go ahead and try. - */ -int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) -{ - struct btrfs_root *root = fs_info->extent_root; - struct btrfs_block_group_cache *block_group; - struct btrfs_space_info *space_info; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct btrfs_device *device; - struct btrfs_trans_handle *trans; - u64 min_free; - u64 dev_min = 1; - u64 dev_nr = 0; - u64 target; - int debug; - int index; - int full = 0; - int ret = 0; - - debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG); - - block_group = btrfs_lookup_block_group(fs_info, bytenr); - - /* odd, couldn't find the block group, leave it alone */ - if (!block_group) { - if (debug) - btrfs_warn(fs_info, - "can't find block group for bytenr %llu", - bytenr); - return -1; - } - - min_free = btrfs_block_group_used(&block_group->item); - - /* no bytes used, we're good */ - if (!min_free) - goto out; - - space_info = block_group->space_info; - spin_lock(&space_info->lock); - - full = space_info->full; - - /* - * if this is the last block group we have in this space, we can't - * relocate it unless we're able to allocate a new chunk below. - * - * Otherwise, we need to make sure we have room in the space to handle - * all of the extents from this block group. If we can, we're good - */ - if ((space_info->total_bytes != block_group->key.offset) && - (btrfs_space_info_used(space_info, false) + min_free < - space_info->total_bytes)) { - spin_unlock(&space_info->lock); - goto out; - } - spin_unlock(&space_info->lock); - - /* - * ok we don't have enough space, but maybe we have free space on our - * devices to allocate new chunks for relocation, so loop through our - * alloc devices and guess if we have enough space. if this block - * group is going to be restriped, run checks against the target - * profile instead of the current one. - */ - ret = -1; - - /* - * index: - * 0: raid10 - * 1: raid1 - * 2: dup - * 3: raid0 - * 4: single - */ - target = get_restripe_target(fs_info, block_group->flags); - if (target) { - index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target)); - } else { - /* - * this is just a balance, so if we were marked as full - * we know there is no space for a new chunk - */ - if (full) { - if (debug) - btrfs_warn(fs_info, - "no space to alloc new chunk for block group %llu", - block_group->key.objectid); - goto out; - } - - index = btrfs_bg_flags_to_raid_index(block_group->flags); - } - - if (index == BTRFS_RAID_RAID10) { - dev_min = 4; - /* Divide by 2 */ - min_free >>= 1; - } else if (index == BTRFS_RAID_RAID1) { - dev_min = 2; - } else if (index == BTRFS_RAID_DUP) { - /* Multiply by 2 */ - min_free <<= 1; - } else if (index == BTRFS_RAID_RAID0) { - dev_min = fs_devices->rw_devices; - min_free = div64_u64(min_free, dev_min); - } - - /* We need to do this so that we can look at pending chunks */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } - - mutex_lock(&fs_info->chunk_mutex); - list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - u64 dev_offset; - - /* - * check to make sure we can actually find a chunk with enough - * space to fit our block group in. - */ - if (device->total_bytes > device->bytes_used + min_free && - !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { - ret = find_free_dev_extent(trans, device, min_free, - &dev_offset, NULL); - if (!ret) - dev_nr++; - - if (dev_nr >= dev_min) - break; - - ret = -1; - } - } - if (debug && ret == -1) - btrfs_warn(fs_info, - "no space to allocate a new chunk for block group %llu", - block_group->key.objectid); - mutex_unlock(&fs_info->chunk_mutex); - btrfs_end_transaction(trans); -out: - btrfs_put_block_group(block_group); - return ret; -} - -static int find_first_block_group(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - struct btrfs_key *key) -{ - struct btrfs_root *root = fs_info->extent_root; - int ret = 0; - struct btrfs_key found_key; - struct extent_buffer *leaf; - struct btrfs_block_group_item bg; - u64 flags; - int slot; - - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - slot = path->slots[0]; - leaf = path->nodes[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - break; - } - btrfs_item_key_to_cpu(leaf, &found_key, slot); - - if (found_key.objectid >= key->objectid && - found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - struct extent_map_tree *em_tree; - struct extent_map *em; - - em_tree = &root->fs_info->mapping_tree.map_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, found_key.objectid, - found_key.offset); - read_unlock(&em_tree->lock); - if (!em) { - btrfs_err(fs_info, - "logical %llu len %llu found bg but no related chunk", - found_key.objectid, found_key.offset); - ret = -ENOENT; - } else if (em->start != found_key.objectid || - em->len != found_key.offset) { - btrfs_err(fs_info, - "block group %llu len %llu mismatch with chunk %llu len %llu", - found_key.objectid, found_key.offset, - em->start, em->len); - ret = -EUCLEAN; - } else { - read_extent_buffer(leaf, &bg, - btrfs_item_ptr_offset(leaf, slot), - sizeof(bg)); - flags = btrfs_block_group_flags(&bg) & - BTRFS_BLOCK_GROUP_TYPE_MASK; - - if (flags != (em->map_lookup->type & - BTRFS_BLOCK_GROUP_TYPE_MASK)) { - btrfs_err(fs_info, -"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", - found_key.objectid, - found_key.offset, flags, - (BTRFS_BLOCK_GROUP_TYPE_MASK & - em->map_lookup->type)); - ret = -EUCLEAN; - } else { - ret = 0; - } - } - free_extent_map(em); - goto out; - } - path->slots[0]++; - } -out: - return ret; -} - -void btrfs_put_block_group_cache(struct btrfs_fs_info *info) -{ - struct btrfs_block_group_cache *block_group; - u64 last = 0; - - while (1) { - struct inode *inode; - - block_group = btrfs_lookup_first_block_group(info, last); - while (block_group) { - wait_block_group_cache_done(block_group); - spin_lock(&block_group->lock); - if (block_group->iref) - break; - spin_unlock(&block_group->lock); - block_group = next_block_group(info, block_group); - } - if (!block_group) { - if (last == 0) - break; - last = 0; - continue; - } - - inode = block_group->inode; - block_group->iref = 0; - block_group->inode = NULL; - spin_unlock(&block_group->lock); - ASSERT(block_group->io_ctl.inode == NULL); - iput(inode); - last = block_group->key.objectid + block_group->key.offset; - btrfs_put_block_group(block_group); - } -} - -/* - * Must be called only after stopping all workers, since we could have block - * group caching kthreads running, and therefore they could race with us if we - * freed the block groups before stopping them. - */ -int btrfs_free_block_groups(struct btrfs_fs_info *info) -{ - struct btrfs_block_group_cache *block_group; - struct btrfs_space_info *space_info; - struct btrfs_caching_control *caching_ctl; - struct rb_node *n; - - down_write(&info->commit_root_sem); - while (!list_empty(&info->caching_block_groups)) { - caching_ctl = list_entry(info->caching_block_groups.next, - struct btrfs_caching_control, list); - list_del(&caching_ctl->list); - put_caching_control(caching_ctl); - } - up_write(&info->commit_root_sem); - - spin_lock(&info->unused_bgs_lock); - while (!list_empty(&info->unused_bgs)) { - block_group = list_first_entry(&info->unused_bgs, - struct btrfs_block_group_cache, - bg_list); - list_del_init(&block_group->bg_list); - btrfs_put_block_group(block_group); - } - spin_unlock(&info->unused_bgs_lock); - - spin_lock(&info->block_group_cache_lock); - while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { - block_group = rb_entry(n, struct btrfs_block_group_cache, - cache_node); - rb_erase(&block_group->cache_node, - &info->block_group_cache_tree); - RB_CLEAR_NODE(&block_group->cache_node); - spin_unlock(&info->block_group_cache_lock); - - down_write(&block_group->space_info->groups_sem); - list_del(&block_group->list); - up_write(&block_group->space_info->groups_sem); - - /* - * We haven't cached this block group, which means we could - * possibly have excluded extents on this block group. - */ - if (block_group->cached == BTRFS_CACHE_NO || - block_group->cached == BTRFS_CACHE_ERROR) - free_excluded_extents(block_group); - - btrfs_remove_free_space_cache(block_group); - ASSERT(block_group->cached != BTRFS_CACHE_STARTED); - ASSERT(list_empty(&block_group->dirty_list)); - ASSERT(list_empty(&block_group->io_list)); - ASSERT(list_empty(&block_group->bg_list)); - ASSERT(atomic_read(&block_group->count) == 1); - btrfs_put_block_group(block_group); - - spin_lock(&info->block_group_cache_lock); - } - spin_unlock(&info->block_group_cache_lock); - - /* now that all the block groups are freed, go through and - * free all the space_info structs. This is only called during - * the final stages of unmount, and so we know nobody is - * using them. We call synchronize_rcu() once before we start, - * just to be on the safe side. - */ - synchronize_rcu(); - - release_global_block_rsv(info); - - while (!list_empty(&info->space_info)) { - int i; - - space_info = list_entry(info->space_info.next, - struct btrfs_space_info, - list); - - /* - * Do not hide this behind enospc_debug, this is actually - * important and indicates a real bug if this happens. - */ - if (WARN_ON(space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0 || - space_info->bytes_may_use > 0)) - dump_space_info(info, space_info, 0, 0); - list_del(&space_info->list); - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { - struct kobject *kobj; - kobj = space_info->block_group_kobjs[i]; - space_info->block_group_kobjs[i] = NULL; - if (kobj) { - kobject_del(kobj); - kobject_put(kobj); - } - } - kobject_del(&space_info->kobj); - kobject_put(&space_info->kobj); - } - return 0; -} - -/* link_block_group will queue up kobjects to add when we're reclaim-safe */ -void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info) -{ - struct btrfs_space_info *space_info; - struct raid_kobject *rkobj; - LIST_HEAD(list); - int index; - int ret = 0; - - spin_lock(&fs_info->pending_raid_kobjs_lock); - list_splice_init(&fs_info->pending_raid_kobjs, &list); - spin_unlock(&fs_info->pending_raid_kobjs_lock); - - list_for_each_entry(rkobj, &list, list) { - space_info = __find_space_info(fs_info, rkobj->flags); - index = btrfs_bg_flags_to_raid_index(rkobj->flags); - - ret = kobject_add(&rkobj->kobj, &space_info->kobj, - "%s", get_raid_name(index)); - if (ret) { - kobject_put(&rkobj->kobj); - break; - } - } - if (ret) - btrfs_warn(fs_info, - "failed to add kobject for block cache, ignoring"); -} - -static void link_block_group(struct btrfs_block_group_cache *cache) -{ - struct btrfs_space_info *space_info = cache->space_info; - struct btrfs_fs_info *fs_info = cache->fs_info; - int index = btrfs_bg_flags_to_raid_index(cache->flags); - bool first = false; - - down_write(&space_info->groups_sem); - if (list_empty(&space_info->block_groups[index])) - first = true; - list_add_tail(&cache->list, &space_info->block_groups[index]); - up_write(&space_info->groups_sem); - - if (first) { - struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); - if (!rkobj) { - btrfs_warn(cache->fs_info, - "couldn't alloc memory for raid level kobject"); - return; - } - rkobj->flags = cache->flags; - kobject_init(&rkobj->kobj, &btrfs_raid_ktype); - - spin_lock(&fs_info->pending_raid_kobjs_lock); - list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs); - spin_unlock(&fs_info->pending_raid_kobjs_lock); - space_info->block_group_kobjs[index] = &rkobj->kobj; - } -} - -static struct btrfs_block_group_cache * -btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, - u64 start, u64 size) -{ - struct btrfs_block_group_cache *cache; - - cache = kzalloc(sizeof(*cache), GFP_NOFS); - if (!cache) - return NULL; - - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - return NULL; - } - - cache->key.objectid = start; - cache->key.offset = size; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - - cache->fs_info = fs_info; - cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); - set_free_space_tree_thresholds(cache); - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - init_rwsem(&cache->data_rwsem); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->bg_list); - INIT_LIST_HEAD(&cache->ro_list); - INIT_LIST_HEAD(&cache->dirty_list); - INIT_LIST_HEAD(&cache->io_list); - btrfs_init_free_space_ctl(cache); - atomic_set(&cache->trimming, 0); - mutex_init(&cache->free_space_lock); - btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); - - return cache; -} - - -/* - * Iterate all chunks and verify that each of them has the corresponding block - * group - */ -static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) -{ - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct btrfs_block_group_cache *bg; - u64 start = 0; - int ret = 0; - - while (1) { - read_lock(&map_tree->map_tree.lock); - /* - * lookup_extent_mapping will return the first extent map - * intersecting the range, so setting @len to 1 is enough to - * get the first chunk. - */ - em = lookup_extent_mapping(&map_tree->map_tree, start, 1); - read_unlock(&map_tree->map_tree.lock); - if (!em) - break; - - bg = btrfs_lookup_block_group(fs_info, em->start); - if (!bg) { - btrfs_err(fs_info, - "chunk start=%llu len=%llu doesn't have corresponding block group", - em->start, em->len); - ret = -EUCLEAN; - free_extent_map(em); - break; - } - if (bg->key.objectid != em->start || - bg->key.offset != em->len || - (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != - (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { - btrfs_err(fs_info, -"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", - em->start, em->len, - em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, - bg->key.objectid, bg->key.offset, - bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); - ret = -EUCLEAN; - free_extent_map(em); - btrfs_put_block_group(bg); - break; - } - start = em->start + em->len; - free_extent_map(em); - btrfs_put_block_group(bg); - } - return ret; -} - -int btrfs_read_block_groups(struct btrfs_fs_info *info) -{ - struct btrfs_path *path; - int ret; - struct btrfs_block_group_cache *cache; - struct btrfs_space_info *space_info; - struct btrfs_key key; - struct btrfs_key found_key; - struct extent_buffer *leaf; - int need_clear = 0; - u64 cache_gen; - u64 feature; - int mixed; - - feature = btrfs_super_incompat_flags(info->super_copy); - mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS); - - key.objectid = 0; - key.offset = 0; - key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = READA_FORWARD; - - cache_gen = btrfs_super_cache_generation(info->super_copy); - if (btrfs_test_opt(info, SPACE_CACHE) && - btrfs_super_generation(info->super_copy) != cache_gen) - need_clear = 1; - if (btrfs_test_opt(info, CLEAR_CACHE)) - need_clear = 1; - - while (1) { - ret = find_first_block_group(info, path, &key); - if (ret > 0) - break; - if (ret != 0) - goto error; - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - cache = btrfs_create_block_group_cache(info, found_key.objectid, - found_key.offset); - if (!cache) { - ret = -ENOMEM; - goto error; - } - - if (need_clear) { - /* - * When we mount with old space cache, we need to - * set BTRFS_DC_CLEAR and set dirty flag. - * - * a) Setting 'BTRFS_DC_CLEAR' makes sure that we - * truncate the old free space cache inode and - * setup a new one. - * b) Setting 'dirty flag' makes sure that we flush - * the new space cache info onto disk. - */ - if (btrfs_test_opt(info, SPACE_CACHE)) - cache->disk_cache_state = BTRFS_DC_CLEAR; - } - - read_extent_buffer(leaf, &cache->item, - btrfs_item_ptr_offset(leaf, path->slots[0]), - sizeof(cache->item)); - cache->flags = btrfs_block_group_flags(&cache->item); - if (!mixed && - ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && - (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { - btrfs_err(info, -"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", - cache->key.objectid); - ret = -EINVAL; - goto error; - } - - key.objectid = found_key.objectid + found_key.offset; - btrfs_release_path(path); - - /* - * We need to exclude the super stripes now so that the space - * info has super bytes accounted for, otherwise we'll think - * we have more space than we actually do. - */ - ret = exclude_super_stripes(cache); - if (ret) { - /* - * We may have excluded something, so call this just in - * case. - */ - free_excluded_extents(cache); - btrfs_put_block_group(cache); - goto error; - } - - /* - * check for two cases, either we are full, and therefore - * don't need to bother with the caching work since we won't - * find any space, or we are empty, and we can just add all - * the space in and be done with it. This saves us _a_lot_ of - * time, particularly in the full case. - */ - if (found_key.offset == btrfs_block_group_used(&cache->item)) { - cache->last_byte_to_unpin = (u64)-1; - cache->cached = BTRFS_CACHE_FINISHED; - free_excluded_extents(cache); - } else if (btrfs_block_group_used(&cache->item) == 0) { - cache->last_byte_to_unpin = (u64)-1; - cache->cached = BTRFS_CACHE_FINISHED; - add_new_free_space(cache, found_key.objectid, - found_key.objectid + - found_key.offset); - free_excluded_extents(cache); - } - - ret = btrfs_add_block_group_cache(info, cache); - if (ret) { - btrfs_remove_free_space_cache(cache); - btrfs_put_block_group(cache); - goto error; - } - - trace_btrfs_add_block_group(info, cache, 0); - update_space_info(info, cache->flags, found_key.offset, - btrfs_block_group_used(&cache->item), - cache->bytes_super, &space_info); - - cache->space_info = space_info; - - link_block_group(cache); - - set_avail_alloc_bits(info, cache->flags); - if (btrfs_chunk_readonly(info, cache->key.objectid)) { - inc_block_group_ro(cache, 1); - } else if (btrfs_block_group_used(&cache->item) == 0) { - ASSERT(list_empty(&cache->bg_list)); - btrfs_mark_bg_unused(cache); - } - } - - list_for_each_entry_rcu(space_info, &info->space_info, list) { - if (!(get_alloc_profile(info, space_info->flags) & - (BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6 | - BTRFS_BLOCK_GROUP_DUP))) - continue; - /* - * avoid allocating from un-mirrored block group if there are - * mirrored block groups. - */ - list_for_each_entry(cache, - &space_info->block_groups[BTRFS_RAID_RAID0], - list) - inc_block_group_ro(cache, 1); - list_for_each_entry(cache, - &space_info->block_groups[BTRFS_RAID_SINGLE], - list) - inc_block_group_ro(cache, 1); - } - - btrfs_add_raid_kobjects(info); - init_global_block_rsv(info); - ret = check_chunk_block_group_mappings(info); -error: - btrfs_free_path(path); - return ret; -} - -void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *block_group; - struct btrfs_root *extent_root = fs_info->extent_root; - struct btrfs_block_group_item item; - struct btrfs_key key; - int ret = 0; - - if (!trans->can_flush_pending_bgs) - return; - - while (!list_empty(&trans->new_bgs)) { - block_group = list_first_entry(&trans->new_bgs, - struct btrfs_block_group_cache, - bg_list); - if (ret) - goto next; - - spin_lock(&block_group->lock); - memcpy(&item, &block_group->item, sizeof(item)); - memcpy(&key, &block_group->key, sizeof(key)); - spin_unlock(&block_group->lock); - - ret = btrfs_insert_item(trans, extent_root, &key, &item, - sizeof(item)); - if (ret) - btrfs_abort_transaction(trans, ret); - ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset); - if (ret) - btrfs_abort_transaction(trans, ret); - add_block_group_free_space(trans, block_group); - /* already aborted the transaction if it failed. */ -next: - btrfs_delayed_refs_rsv_release(fs_info, 1); - list_del_init(&block_group->bg_list); - } - btrfs_trans_release_chunk_metadata(trans); -} - -int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, - u64 type, u64 chunk_offset, u64 size) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *cache; - int ret; - - btrfs_set_log_full_commit(fs_info, trans); - - cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); - if (!cache) - return -ENOMEM; - - btrfs_set_block_group_used(&cache->item, bytes_used); - btrfs_set_block_group_chunk_objectid(&cache->item, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); - btrfs_set_block_group_flags(&cache->item, type); - - cache->flags = type; - cache->last_byte_to_unpin = (u64)-1; - cache->cached = BTRFS_CACHE_FINISHED; - cache->needs_free_space = 1; - ret = exclude_super_stripes(cache); - if (ret) { - /* - * We may have excluded something, so call this just in - * case. - */ - free_excluded_extents(cache); - btrfs_put_block_group(cache); - return ret; - } - - add_new_free_space(cache, chunk_offset, chunk_offset + size); - - free_excluded_extents(cache); - -#ifdef CONFIG_BTRFS_DEBUG - if (btrfs_should_fragment_free_space(cache)) { - u64 new_bytes_used = size - bytes_used; - - bytes_used += new_bytes_used >> 1; - fragment_free_space(cache); - } -#endif - /* - * Ensure the corresponding space_info object is created and - * assigned to our block group. We want our bg to be added to the rbtree - * with its ->space_info set. - */ - cache->space_info = __find_space_info(fs_info, cache->flags); - ASSERT(cache->space_info); - - ret = btrfs_add_block_group_cache(fs_info, cache); - if (ret) { - btrfs_remove_free_space_cache(cache); - btrfs_put_block_group(cache); - return ret; - } - - /* - * Now that our block group has its ->space_info set and is inserted in - * the rbtree, update the space info's counters. - */ - trace_btrfs_add_block_group(fs_info, cache, 1); - update_space_info(fs_info, cache->flags, size, bytes_used, - cache->bytes_super, &cache->space_info); - update_global_block_rsv(fs_info); - - link_block_group(cache); - - list_add_tail(&cache->bg_list, &trans->new_bgs); - trans->delayed_ref_updates++; - btrfs_update_delayed_refs_rsv(trans); - - set_avail_alloc_bits(fs_info, type); - return 0; -} - -static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) -{ - u64 extra_flags = chunk_to_extended(flags) & - BTRFS_EXTENDED_PROFILE_MASK; - - write_seqlock(&fs_info->profiles_lock); - if (flags & BTRFS_BLOCK_GROUP_DATA) - fs_info->avail_data_alloc_bits &= ~extra_flags; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - fs_info->avail_metadata_alloc_bits &= ~extra_flags; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - fs_info->avail_system_alloc_bits &= ~extra_flags; - write_sequnlock(&fs_info->profiles_lock); -} - -int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - u64 group_start, struct extent_map *em) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root = fs_info->extent_root; - struct btrfs_path *path; - struct btrfs_block_group_cache *block_group; - struct btrfs_free_cluster *cluster; - struct btrfs_root *tree_root = fs_info->tree_root; - struct btrfs_key key; - struct inode *inode; - struct kobject *kobj = NULL; - int ret; - int index; - int factor; - struct btrfs_caching_control *caching_ctl = NULL; - bool remove_em; - bool remove_rsv = false; - - block_group = btrfs_lookup_block_group(fs_info, group_start); - BUG_ON(!block_group); - BUG_ON(!block_group->ro); - - trace_btrfs_remove_block_group(block_group); - /* - * Free the reserved super bytes from this block group before - * remove it. - */ - free_excluded_extents(block_group); - btrfs_free_ref_tree_range(fs_info, block_group->key.objectid, - block_group->key.offset); - - memcpy(&key, &block_group->key, sizeof(key)); - index = btrfs_bg_flags_to_raid_index(block_group->flags); - factor = btrfs_bg_type_to_factor(block_group->flags); - - /* make sure this block group isn't part of an allocation cluster */ - cluster = &fs_info->data_alloc_cluster; - spin_lock(&cluster->refill_lock); - btrfs_return_cluster_to_free_space(block_group, cluster); - spin_unlock(&cluster->refill_lock); - - /* - * make sure this block group isn't part of a metadata - * allocation cluster - */ - cluster = &fs_info->meta_alloc_cluster; - spin_lock(&cluster->refill_lock); - btrfs_return_cluster_to_free_space(block_group, cluster); - spin_unlock(&cluster->refill_lock); - - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - /* - * get the inode first so any iput calls done for the io_list - * aren't the final iput (no unlinks allowed now) - */ - inode = lookup_free_space_inode(fs_info, block_group, path); - - mutex_lock(&trans->transaction->cache_write_mutex); - /* - * Make sure our free space cache IO is done before removing the - * free space inode - */ - spin_lock(&trans->transaction->dirty_bgs_lock); - if (!list_empty(&block_group->io_list)) { - list_del_init(&block_group->io_list); - - WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); - - spin_unlock(&trans->transaction->dirty_bgs_lock); - btrfs_wait_cache_io(trans, block_group, path); - btrfs_put_block_group(block_group); - spin_lock(&trans->transaction->dirty_bgs_lock); - } - - if (!list_empty(&block_group->dirty_list)) { - list_del_init(&block_group->dirty_list); - remove_rsv = true; - btrfs_put_block_group(block_group); - } - spin_unlock(&trans->transaction->dirty_bgs_lock); - mutex_unlock(&trans->transaction->cache_write_mutex); - - if (!IS_ERR(inode)) { - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); - if (ret) { - btrfs_add_delayed_iput(inode); - goto out; - } - clear_nlink(inode); - /* One for the block groups ref */ - spin_lock(&block_group->lock); - if (block_group->iref) { - block_group->iref = 0; - block_group->inode = NULL; - spin_unlock(&block_group->lock); - iput(inode); - } else { - spin_unlock(&block_group->lock); - } - /* One for our lookup ref */ - btrfs_add_delayed_iput(inode); - } - - key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = block_group->key.objectid; - key.type = 0; - - ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); - if (ret < 0) - goto out; - if (ret > 0) - btrfs_release_path(path); - if (ret == 0) { - ret = btrfs_del_item(trans, tree_root, path); - if (ret) - goto out; - btrfs_release_path(path); - } - - spin_lock(&fs_info->block_group_cache_lock); - rb_erase(&block_group->cache_node, - &fs_info->block_group_cache_tree); - RB_CLEAR_NODE(&block_group->cache_node); - - if (fs_info->first_logical_byte == block_group->key.objectid) - fs_info->first_logical_byte = (u64)-1; - spin_unlock(&fs_info->block_group_cache_lock); - - down_write(&block_group->space_info->groups_sem); - /* - * we must use list_del_init so people can check to see if they - * are still on the list after taking the semaphore - */ - list_del_init(&block_group->list); - if (list_empty(&block_group->space_info->block_groups[index])) { - kobj = block_group->space_info->block_group_kobjs[index]; - block_group->space_info->block_group_kobjs[index] = NULL; - clear_avail_alloc_bits(fs_info, block_group->flags); - } - up_write(&block_group->space_info->groups_sem); - if (kobj) { - kobject_del(kobj); - kobject_put(kobj); - } - - if (block_group->has_caching_ctl) - caching_ctl = get_caching_control(block_group); - if (block_group->cached == BTRFS_CACHE_STARTED) - wait_block_group_cache_done(block_group); - if (block_group->has_caching_ctl) { - down_write(&fs_info->commit_root_sem); - if (!caching_ctl) { - struct btrfs_caching_control *ctl; - - list_for_each_entry(ctl, - &fs_info->caching_block_groups, list) - if (ctl->block_group == block_group) { - caching_ctl = ctl; - refcount_inc(&caching_ctl->count); - break; - } - } - if (caching_ctl) - list_del_init(&caching_ctl->list); - up_write(&fs_info->commit_root_sem); - if (caching_ctl) { - /* Once for the caching bgs list and once for us. */ - put_caching_control(caching_ctl); - put_caching_control(caching_ctl); - } - } - - spin_lock(&trans->transaction->dirty_bgs_lock); - WARN_ON(!list_empty(&block_group->dirty_list)); - WARN_ON(!list_empty(&block_group->io_list)); - spin_unlock(&trans->transaction->dirty_bgs_lock); - - btrfs_remove_free_space_cache(block_group); - - spin_lock(&block_group->space_info->lock); - list_del_init(&block_group->ro_list); - - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { - WARN_ON(block_group->space_info->total_bytes - < block_group->key.offset); - WARN_ON(block_group->space_info->bytes_readonly - < block_group->key.offset); - WARN_ON(block_group->space_info->disk_total - < block_group->key.offset * factor); - } - block_group->space_info->total_bytes -= block_group->key.offset; - block_group->space_info->bytes_readonly -= block_group->key.offset; - block_group->space_info->disk_total -= block_group->key.offset * factor; - - spin_unlock(&block_group->space_info->lock); - - memcpy(&key, &block_group->key, sizeof(key)); - - mutex_lock(&fs_info->chunk_mutex); - if (!list_empty(&em->list)) { - /* We're in the transaction->pending_chunks list. */ - free_extent_map(em); - } - spin_lock(&block_group->lock); - block_group->removed = 1; - /* - * At this point trimming can't start on this block group, because we - * removed the block group from the tree fs_info->block_group_cache_tree - * so no one can't find it anymore and even if someone already got this - * block group before we removed it from the rbtree, they have already - * incremented block_group->trimming - if they didn't, they won't find - * any free space entries because we already removed them all when we - * called btrfs_remove_free_space_cache(). - * - * And we must not remove the extent map from the fs_info->mapping_tree - * to prevent the same logical address range and physical device space - * ranges from being reused for a new block group. This is because our - * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is - * completely transactionless, so while it is trimming a range the - * currently running transaction might finish and a new one start, - * allowing for new block groups to be created that can reuse the same - * physical device locations unless we take this special care. - * - * There may also be an implicit trim operation if the file system - * is mounted with -odiscard. The same protections must remain - * in place until the extents have been discarded completely when - * the transaction commit has completed. - */ - remove_em = (atomic_read(&block_group->trimming) == 0); - /* - * Make sure a trimmer task always sees the em in the pinned_chunks list - * if it sees block_group->removed == 1 (needs to lock block_group->lock - * before checking block_group->removed). - */ - if (!remove_em) { - /* - * Our em might be in trans->transaction->pending_chunks which - * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), - * and so is the fs_info->pinned_chunks list. - * - * So at this point we must be holding the chunk_mutex to avoid - * any races with chunk allocation (more specifically at - * volumes.c:contains_pending_extent()), to ensure it always - * sees the em, either in the pending_chunks list or in the - * pinned_chunks list. - */ - list_move_tail(&em->list, &fs_info->pinned_chunks); - } - spin_unlock(&block_group->lock); - - if (remove_em) { - struct extent_map_tree *em_tree; - - em_tree = &fs_info->mapping_tree.map_tree; - write_lock(&em_tree->lock); - /* - * The em might be in the pending_chunks list, so make sure the - * chunk mutex is locked, since remove_extent_mapping() will - * delete us from that list. - */ - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - /* once for the tree */ - free_extent_map(em); - } - - mutex_unlock(&fs_info->chunk_mutex); - - ret = remove_block_group_free_space(trans, block_group); - if (ret) - goto out; - - btrfs_put_block_group(block_group); - btrfs_put_block_group(block_group); - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) - ret = -EIO; - if (ret < 0) - goto out; - - ret = btrfs_del_item(trans, root, path); -out: - if (remove_rsv) - btrfs_delayed_refs_rsv_release(fs_info, 1); - btrfs_free_path(path); - return ret; -} - -struct btrfs_trans_handle * -btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, - const u64 chunk_offset) -{ - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; - struct extent_map *em; - struct map_lookup *map; - unsigned int num_items; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, 1); - read_unlock(&em_tree->lock); - ASSERT(em && em->start == chunk_offset); - - /* - * We need to reserve 3 + N units from the metadata space info in order - * to remove a block group (done at btrfs_remove_chunk() and at - * btrfs_remove_block_group()), which are used for: - * - * 1 unit for adding the free space inode's orphan (located in the tree - * of tree roots). - * 1 unit for deleting the block group item (located in the extent - * tree). - * 1 unit for deleting the free space item (located in tree of tree - * roots). - * N units for deleting N device extent items corresponding to each - * stripe (located in the device tree). - * - * In order to remove a block group we also need to reserve units in the - * system space info in order to update the chunk tree (update one or - * more device items and remove one chunk item), but this is done at - * btrfs_remove_chunk() through a call to check_system_chunk(). - */ - map = em->map_lookup; - num_items = 3 + map->num_stripes; - free_extent_map(em); - - return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, - num_items, 1); -} - -/* - * Process the unused_bgs list and remove any that don't have any allocated - * space inside of them. - */ -void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) -{ - struct btrfs_block_group_cache *block_group; - struct btrfs_space_info *space_info; - struct btrfs_trans_handle *trans; - int ret = 0; - - if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) - return; - - spin_lock(&fs_info->unused_bgs_lock); - while (!list_empty(&fs_info->unused_bgs)) { - u64 start, end; - int trimming; - - block_group = list_first_entry(&fs_info->unused_bgs, - struct btrfs_block_group_cache, - bg_list); - list_del_init(&block_group->bg_list); - - space_info = block_group->space_info; - - if (ret || btrfs_mixed_space_info(space_info)) { - btrfs_put_block_group(block_group); - continue; - } - spin_unlock(&fs_info->unused_bgs_lock); - - mutex_lock(&fs_info->delete_unused_bgs_mutex); - - /* Don't want to race with allocators so take the groups_sem */ - down_write(&space_info->groups_sem); - spin_lock(&block_group->lock); - if (block_group->reserved || block_group->pinned || - btrfs_block_group_used(&block_group->item) || - block_group->ro || - list_is_singular(&block_group->list)) { - /* - * We want to bail if we made new allocations or have - * outstanding allocations in this block group. We do - * the ro check in case balance is currently acting on - * this block group. - */ - trace_btrfs_skip_unused_block_group(block_group); - spin_unlock(&block_group->lock); - up_write(&space_info->groups_sem); - goto next; - } - spin_unlock(&block_group->lock); - - /* We don't want to force the issue, only flip if it's ok. */ - ret = inc_block_group_ro(block_group, 0); - up_write(&space_info->groups_sem); - if (ret < 0) { - ret = 0; - goto next; - } - - /* - * Want to do this before we do anything else so we can recover - * properly if we fail to join the transaction. - */ - trans = btrfs_start_trans_remove_block_group(fs_info, - block_group->key.objectid); - if (IS_ERR(trans)) { - btrfs_dec_block_group_ro(block_group); - ret = PTR_ERR(trans); - goto next; - } - - /* - * We could have pending pinned extents for this block group, - * just delete them, we don't care about them anymore. - */ - start = block_group->key.objectid; - end = start + block_group->key.offset - 1; - /* - * Hold the unused_bg_unpin_mutex lock to avoid racing with - * btrfs_finish_extent_commit(). If we are at transaction N, - * another task might be running finish_extent_commit() for the - * previous transaction N - 1, and have seen a range belonging - * to the block group in freed_extents[] before we were able to - * clear the whole block group range from freed_extents[]. This - * means that task can lookup for the block group after we - * unpinned it from freed_extents[] and removed it, leading to - * a BUG_ON() at btrfs_unpin_extent_range(). - */ - mutex_lock(&fs_info->unused_bg_unpin_mutex); - ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, - EXTENT_DIRTY); - if (ret) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_dec_block_group_ro(block_group); - goto end_trans; - } - ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, - EXTENT_DIRTY); - if (ret) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_dec_block_group_ro(block_group); - goto end_trans; - } - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - - /* Reset pinned so btrfs_put_block_group doesn't complain */ - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - - update_bytes_pinned(space_info, -block_group->pinned); - space_info->bytes_readonly += block_group->pinned; - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -block_group->pinned, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - block_group->pinned = 0; - - spin_unlock(&block_group->lock); - spin_unlock(&space_info->lock); - - /* DISCARD can flip during remount */ - trimming = btrfs_test_opt(fs_info, DISCARD); - - /* Implicit trim during transaction commit. */ - if (trimming) - btrfs_get_block_group_trimming(block_group); - - /* - * Btrfs_remove_chunk will abort the transaction if things go - * horribly wrong. - */ - ret = btrfs_remove_chunk(trans, block_group->key.objectid); - - if (ret) { - if (trimming) - btrfs_put_block_group_trimming(block_group); - goto end_trans; - } - - /* - * If we're not mounted with -odiscard, we can just forget - * about this block group. Otherwise we'll need to wait - * until transaction commit to do the actual discard. - */ - if (trimming) { - spin_lock(&fs_info->unused_bgs_lock); - /* - * A concurrent scrub might have added us to the list - * fs_info->unused_bgs, so use a list_move operation - * to add the block group to the deleted_bgs list. - */ - list_move(&block_group->bg_list, - &trans->transaction->deleted_bgs); - spin_unlock(&fs_info->unused_bgs_lock); - btrfs_get_block_group(block_group); - } -end_trans: - btrfs_end_transaction(trans); -next: - mutex_unlock(&fs_info->delete_unused_bgs_mutex); - btrfs_put_block_group(block_group); - spin_lock(&fs_info->unused_bgs_lock); - } - spin_unlock(&fs_info->unused_bgs_lock); -} - -int btrfs_init_space_info(struct btrfs_fs_info *fs_info) -{ - struct btrfs_super_block *disk_super; - u64 features; - u64 flags; - int mixed = 0; - int ret; - - disk_super = fs_info->super_copy; - if (!btrfs_super_root(disk_super)) - return -EINVAL; - - features = btrfs_super_incompat_flags(disk_super); - if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) - mixed = 1; - - flags = BTRFS_BLOCK_GROUP_SYSTEM; - ret = create_space_info(fs_info, flags); - if (ret) - goto out; - - if (mixed) { - flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; - ret = create_space_info(fs_info, flags); - } else { - flags = BTRFS_BLOCK_GROUP_METADATA; - ret = create_space_info(fs_info, flags); - if (ret) - goto out; - - flags = BTRFS_BLOCK_GROUP_DATA; - ret = create_space_info(fs_info, flags); - } -out: - return ret; -} - int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { @@ -11314,10 +5570,9 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, * it while performing the free space search since we have already * held back allocations. */ -static int btrfs_trim_free_extents(struct btrfs_device *device, - u64 minlen, u64 *trimmed) +static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) { - u64 start = 0, len = 0; + u64 start = SZ_1M, len = 0, end = 0; int ret; *trimmed = 0; @@ -11338,43 +5593,41 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, while (1) { struct btrfs_fs_info *fs_info = device->fs_info; - struct btrfs_transaction *trans; u64 bytes; ret = mutex_lock_interruptible(&fs_info->chunk_mutex); if (ret) break; - ret = down_read_killable(&fs_info->commit_root_sem); - if (ret) { - mutex_unlock(&fs_info->chunk_mutex); - break; - } + find_first_clear_extent_bit(&device->alloc_state, start, + &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); - spin_lock(&fs_info->trans_lock); - trans = fs_info->running_transaction; - if (trans) - refcount_inc(&trans->use_count); - spin_unlock(&fs_info->trans_lock); + /* Ensure we skip the reserved area in the first 1M */ + start = max_t(u64, start, SZ_1M); - if (!trans) - up_read(&fs_info->commit_root_sem); + /* + * If find_first_clear_extent_bit find a range that spans the + * end of the device it will set end to -1, in this case it's up + * to the caller to trim the value to the size of the device. + */ + end = min(end, device->total_bytes - 1); - ret = find_free_dev_extent_start(trans, device, minlen, start, - &start, &len); - if (trans) { - up_read(&fs_info->commit_root_sem); - btrfs_put_transaction(trans); - } + len = end - start + 1; - if (ret) { + /* We didn't find any extents */ + if (!len) { mutex_unlock(&fs_info->chunk_mutex); - if (ret == -ENOSPC) - ret = 0; + ret = 0; break; } - ret = btrfs_issue_discard(device->bdev, start, len, &bytes); + ret = btrfs_issue_discard(device->bdev, start, len, + &bytes); + if (!ret) + set_extent_bits(&device->alloc_state, start, + start + bytes - 1, + CHUNK_TRIMMED); mutex_unlock(&fs_info->chunk_mutex); if (ret) @@ -11405,10 +5658,11 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, */ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) { - struct btrfs_block_group_cache *cache = NULL; + struct btrfs_block_group *cache = NULL; struct btrfs_device *device; struct list_head *devices; u64 group_trimmed; + u64 range_end = U64_MAX; u64 start; u64 end; u64 trimmed = 0; @@ -11418,26 +5672,33 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) int dev_ret = 0; int ret = 0; + /* + * Check range overflow if range->len is set. + * The default range->len is U64_MAX. + */ + if (range->len != U64_MAX && + check_add_overflow(range->start, range->len, &range_end)) + return -EINVAL; + cache = btrfs_lookup_first_block_group(fs_info, range->start); - for (; cache; cache = next_block_group(fs_info, cache)) { - if (cache->key.objectid >= (range->start + range->len)) { + for (; cache; cache = btrfs_next_block_group(cache)) { + if (cache->start >= range_end) { btrfs_put_block_group(cache); break; } - start = max(range->start, cache->key.objectid); - end = min(range->start + range->len, - cache->key.objectid + cache->key.offset); + start = max(range->start, cache->start); + end = min(range_end, cache->start + cache->length); if (end - start >= range->minlen) { - if (!block_group_cache_done(cache)) { - ret = cache_block_group(cache, 0); + if (!btrfs_block_group_done(cache)) { + ret = btrfs_cache_block_group(cache, 0); if (ret) { bg_failed++; bg_ret = ret; continue; } - ret = wait_block_group_cache_done(cache); + ret = btrfs_wait_block_group_cache_done(cache); if (ret) { bg_failed++; bg_ret = ret; @@ -11466,8 +5727,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) mutex_lock(&fs_info->fs_devices->device_list_mutex); devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { - ret = btrfs_trim_free_extents(device, range->minlen, - &group_trimmed); + ret = btrfs_trim_free_extents(device, &group_trimmed); if (ret) { dev_failed++; dev_ret = ret; @@ -11531,16 +5791,3 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) !atomic_read(&root->will_be_snapshotted)); } } - -void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg) -{ - struct btrfs_fs_info *fs_info = bg->fs_info; - - spin_lock(&fs_info->unused_bgs_lock); - if (list_empty(&bg->bg_list)) { - btrfs_get_block_group(bg); - trace_btrfs_add_unused_block_group(bg); - list_add_tail(&bg->bg_list, &fs_info->unused_bgs); - } - spin_unlock(&fs_info->unused_bgs_lock); -} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ca8b8e785cf3..2f4802f405a2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -14,6 +14,7 @@ #include <linux/prefetch.h> #include <linux/cleancache.h> #include "extent_io.h" +#include "extent-io-tree.h" #include "extent_map.h" #include "ctree.h" #include "btrfs_inode.h" @@ -59,12 +60,23 @@ void btrfs_leak_debug_del(struct list_head *entry) spin_unlock_irqrestore(&leak_lock, flags); } -static inline -void btrfs_leak_debug_check(void) +static inline void btrfs_extent_buffer_leak_debug_check(void) { - struct extent_state *state; struct extent_buffer *eb; + while (!list_empty(&buffers)) { + eb = list_entry(buffers.next, struct extent_buffer, leak_list); + pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n", + eb->start, eb->len, atomic_read(&eb->refs), eb->bflags); + list_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); + } +} + +static inline void btrfs_extent_state_leak_debug_check(void) +{ + struct extent_state *state; + while (!list_empty(&states)) { state = list_entry(states.next, struct extent_state, leak_list); pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", @@ -74,14 +86,6 @@ void btrfs_leak_debug_check(void) list_del(&state->leak_list); kmem_cache_free(extent_state_cache, state); } - - while (!list_empty(&buffers)) { - eb = list_entry(buffers.next, struct extent_buffer, leak_list); - pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n", - eb->start, eb->len, atomic_read(&eb->refs), eb->bflags); - list_del(&eb->leak_list); - kmem_cache_free(extent_buffer_cache, eb); - } } #define btrfs_debug_check_extent_io_range(tree, start, end) \ @@ -105,12 +109,11 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, #else #define btrfs_leak_debug_add(new, head) do {} while (0) #define btrfs_leak_debug_del(entry) do {} while (0) -#define btrfs_leak_debug_check() do {} while (0) +#define btrfs_extent_buffer_leak_debug_check() do {} while (0) +#define btrfs_extent_state_leak_debug_check() do {} while (0) #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif -#define BUFFER_LRU_MAX 64 - struct tree_entry { u64 start; u64 end; @@ -151,49 +154,70 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags) { blk_status_t ret = 0; - struct bio_vec *bvec = bio_last_bvec_all(bio); - struct bio_vec bv; struct extent_io_tree *tree = bio->bi_private; - u64 start; - - mp_bvec_last_segment(bvec, &bv); - start = page_offset(bv.bv_page) + bv.bv_offset; bio->bi_private = NULL; if (tree->ops) ret = tree->ops->submit_bio_hook(tree->private_data, bio, - mirror_num, bio_flags, start); + mirror_num, bio_flags); else btrfsic_submit_bio(bio); return blk_status_to_errno(ret); } -static void flush_write_bio(struct extent_page_data *epd) +/* Cleanup unsubmitted bios */ +static void end_write_bio(struct extent_page_data *epd, int ret) { if (epd->bio) { - int ret; + epd->bio->bi_status = errno_to_blk_status(ret); + bio_endio(epd->bio); + epd->bio = NULL; + } +} + +/* + * Submit bio from extent page data via submit_one_bio + * + * Return 0 if everything is OK. + * Return <0 for error. + */ +static int __must_check flush_write_bio(struct extent_page_data *epd) +{ + int ret = 0; + if (epd->bio) { ret = submit_one_bio(epd->bio, 0, 0); - BUG_ON(ret < 0); /* -ENOMEM */ + /* + * Clean up of epd->bio is handled by its endio function. + * And endio is either triggered by successful bio execution + * or the error handler of submit bio hook. + * So at this point, no matter what happened, we don't need + * to clean up epd->bio. + */ epd->bio = NULL; } + return ret; } -int __init extent_io_init(void) +int __init extent_state_cache_init(void) { extent_state_cache = kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; + return 0; +} +int __init extent_io_init(void) +{ extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) - goto free_state_cache; + return -ENOMEM; if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_io_bio, bio), @@ -211,35 +235,68 @@ free_bioset: free_buffer_cache: kmem_cache_destroy(extent_buffer_cache); extent_buffer_cache = NULL; + return -ENOMEM; +} -free_state_cache: +void __cold extent_state_cache_exit(void) +{ + btrfs_extent_state_leak_debug_check(); kmem_cache_destroy(extent_state_cache); - extent_state_cache = NULL; - return -ENOMEM; } void __cold extent_io_exit(void) { - btrfs_leak_debug_check(); + btrfs_extent_buffer_leak_debug_check(); /* * Make sure all delayed rcu free are flushed before we * destroy caches. */ rcu_barrier(); - kmem_cache_destroy(extent_state_cache); kmem_cache_destroy(extent_buffer_cache); bioset_exit(&btrfs_bioset); } -void extent_io_tree_init(struct extent_io_tree *tree, +void extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner, void *private_data) { + tree->fs_info = fs_info; tree->state = RB_ROOT; tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); tree->private_data = private_data; + tree->owner = owner; +} + +void extent_io_tree_release(struct extent_io_tree *tree) +{ + spin_lock(&tree->lock); + /* + * Do a single barrier for the waitqueue_active check here, the state + * of the waitqueue should not change once extent_io_tree_release is + * called. + */ + smp_mb(); + while (!RB_EMPTY_ROOT(&tree->state)) { + struct rb_node *node; + struct extent_state *state; + + node = rb_first(&tree->state); + state = rb_entry(node, struct extent_state, rb_node); + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + /* + * btree io trees aren't supposed to have tasks waiting for + * changes in the flags of extent states ever. + */ + ASSERT(!waitqueue_active(&state->wq)); + free_extent_state(state); + + cond_resched_lock(&tree->lock); + } + spin_unlock(&tree->lock); } static struct extent_state *alloc_extent_state(gfp_t mask) @@ -312,6 +369,24 @@ do_insert: return NULL; } +/** + * __etree_search - searche @tree for an entry that contains @offset. Such + * entry would have entry->start <= offset && entry->end >= offset. + * + * @tree - the tree to search + * @offset - offset that should fall within an entry in @tree + * @next_ret - pointer to the first entry whose range ends after @offset + * @prev - pointer to the first entry whose range begins before @offset + * @p_ret - pointer where new node should be anchored (used when inserting an + * entry in the tree) + * @parent_ret - points to entry which would have been the parent of the entry, + * containing @offset + * + * This function returns a pointer to the entry that contains @offset byte + * address. If no such entry exists, then NULL is returned and the other + * pointer arguments to the function are filled, otherwise the found entry is + * returned and other pointers are left untouched. + */ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, struct rb_node **next_ret, struct rb_node **prev_ret, @@ -400,7 +475,7 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *other; struct rb_node *other_node; - if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) return; other_node = rb_prev(&state->rb_node); @@ -457,9 +532,11 @@ static int insert_state(struct extent_io_tree *tree, { struct rb_node *node; - if (end < start) - WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", - end, start); + if (end < start) { + btrfs_err(tree->fs_info, + "insert state: end < start %llu %llu", end, start); + WARN_ON(1); + } state->start = start; state->end = end; @@ -469,7 +546,8 @@ static int insert_state(struct extent_io_tree *tree, if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); - pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n", + btrfs_err(tree->fs_info, + "found node %llu %llu on insert of %llu %llu", found->start, found->end, start, end); return -EEXIST; } @@ -611,6 +689,7 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int clear = 0; btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; @@ -618,7 +697,7 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (delete) bits |= ~EXTENT_CTLBITS; - if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) clear = 1; again: if (!prealloc && gfpflags_allow_blocking(mask)) { @@ -850,7 +929,7 @@ static void cache_state(struct extent_state *state, struct extent_state **cached_ptr) { return cache_state_if_flags(state, cached_ptr, - EXTENT_IOBITS | EXTENT_BOUNDARY); + EXTENT_LOCKED | EXTENT_BOUNDARY); } /* @@ -880,6 +959,7 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_end; btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); again: if (!prealloc && gfpflags_allow_blocking(mask)) { @@ -1112,6 +1192,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, bool first_iteration = true; btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits, + clear_bits); again: if (!prealloc) { @@ -1311,6 +1393,13 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, changeset); } +int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits) +{ + return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, + GFP_NOWAIT, NULL); +} + int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached) @@ -1478,15 +1567,128 @@ out: return ret; } +/** + * find_first_clear_extent_bit - find the first range that has @bits not set. + * This range could start before @start. + * + * @tree - the tree to search + * @start - the offset at/after which the found extent should start + * @start_ret - records the beginning of the range + * @end_ret - records the end of the range (inclusive) + * @bits - the set of bits which must be unset + * + * Since unallocated range is also considered one which doesn't have the bits + * set it's possible that @end_ret contains -1, this happens in case the range + * spans (last_range_end, end of device]. In this case it's up to the caller to + * trim @end_ret to the appropriate size. + */ +void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, unsigned bits) +{ + struct extent_state *state; + struct rb_node *node, *prev = NULL, *next; + + spin_lock(&tree->lock); + + /* Find first extent with bits cleared */ + while (1) { + node = __etree_search(tree, start, &next, &prev, NULL, NULL); + if (!node) { + node = next; + if (!node) { + /* + * We are past the last allocated chunk, + * set start at the end of the last extent. The + * device alloc tree should never be empty so + * prev is always set. + */ + ASSERT(prev); + state = rb_entry(prev, struct extent_state, rb_node); + *start_ret = state->end + 1; + *end_ret = -1; + goto out; + } + } + /* + * At this point 'node' either contains 'start' or start is + * before 'node' + */ + state = rb_entry(node, struct extent_state, rb_node); + + if (in_range(start, state->start, state->end - state->start + 1)) { + if (state->state & bits) { + /* + * |--range with bits sets--| + * | + * start + */ + start = state->end + 1; + } else { + /* + * 'start' falls within a range that doesn't + * have the bits set, so take its start as + * the beginning of the desired range + * + * |--range with bits cleared----| + * | + * start + */ + *start_ret = state->start; + break; + } + } else { + /* + * |---prev range---|---hole/unset---|---node range---| + * | + * start + * + * or + * + * |---hole/unset--||--first node--| + * 0 | + * start + */ + if (prev) { + state = rb_entry(prev, struct extent_state, + rb_node); + *start_ret = state->end + 1; + } else { + *start_ret = 0; + } + break; + } + } + + /* + * Find the longest stretch from start until an entry which has the + * bits set + */ + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && !(state->state & bits)) { + *end_ret = state->end; + } else { + *end_ret = state->start - 1; + break; + } + + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); +} + /* * find a contiguous range of bytes in the file marked as delalloc, not * more than 'max_bytes'. start and end are used to return the range, * * true is returned if we find something, false if nothing was in the tree */ -static noinline bool find_delalloc_range(struct extent_io_tree *tree, - u64 *start, u64 *end, u64 max_bytes, - struct extent_state **cached_state) +bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, + u64 *end, u64 max_bytes, + struct extent_state **cached_state) { struct rb_node *node; struct extent_state *state; @@ -1588,10 +1790,10 @@ static noinline int lock_delalloc_pages(struct inode *inode, */ EXPORT_FOR_TESTS noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, struct page *locked_page, u64 *start, u64 *end) { + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; @@ -1604,8 +1806,8 @@ again: /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; delalloc_end = 0; - found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, - max_bytes, &cached_state); + found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, + max_bytes, &cached_state); if (!found || delalloc_end <= *start) { *start = delalloc_start; *end = delalloc_end; @@ -1707,7 +1909,7 @@ static int __process_pages_contig(struct address_space *mapping, if (page_ops & PAGE_SET_PRIVATE2) SetPagePrivate2(pages[i]); - if (pages[i] == locked_page) { + if (locked_page && pages[i] == locked_page) { put_page(pages[i]); pages_locked++; continue; @@ -1746,9 +1948,9 @@ out: } void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, - u64 delalloc_end, struct page *locked_page, - unsigned clear_bits, - unsigned long page_ops) + struct page *locked_page, + unsigned clear_bits, + unsigned long page_ops) { clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0, NULL); @@ -1822,8 +2024,8 @@ out: * set the private field for a given byte offset in the tree. If there isn't * an extent_state there already, this does nothing. */ -static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record *failrec) +int set_state_failrec(struct extent_io_tree *tree, u64 start, + struct io_failure_record *failrec) { struct rb_node *node; struct extent_state *state; @@ -1850,8 +2052,8 @@ out: return ret; } -static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record **failrec) +int get_state_failrec(struct extent_io_tree *tree, u64 start, + struct io_failure_record **failrec) { struct rb_node *node; struct extent_state *state; @@ -2061,9 +2263,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, return 0; } -int repair_eb_io_failure(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int mirror_num) +int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num) { + struct btrfs_fs_info *fs_info = eb->fs_info; u64 start = eb->start; int i, num_pages = num_extent_pages(eb); int ret = 0; @@ -2342,7 +2544,6 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, bio = btrfs_io_bio_alloc(1); bio->bi_end_io = endio_func; bio->bi_iter.bi_sector = failrec->logical >> 9; - bio_set_dev(bio, fs_info->fs_devices->latest_bdev); bio->bi_iter.bi_size = 0; bio->bi_private = data; @@ -2409,7 +2610,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, read_mode, failrec->this_mirror, failrec->in_validation); status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror, - failrec->bio_flags, 0); + failrec->bio_flags); if (status) { free_io_failure(failure_tree, tree, failrec); bio_put(bio); @@ -2451,11 +2652,10 @@ static void end_bio_extent_writepage(struct bio *bio) struct bio_vec *bvec; u64 start; u64 end; - int i; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i, iter_all) { + bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2523,11 +2723,10 @@ static void end_bio_extent_readpage(struct bio *bio) u64 extent_len = 0; int mirror; int ret; - int i; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i, iter_all) { + bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2607,8 +2806,6 @@ static void end_bio_extent_readpage(struct bio *bio) if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) btree_readahead_hook(eb, -EIO); - - ret = -EIO; } readpage_ok: if (likely(uptodate)) { @@ -2673,12 +2870,11 @@ static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio) * never fail. We're returning a bio right now but you can call btrfs_io_bio * for the appropriate container_of magic */ -struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) +struct bio *btrfs_bio_alloc(u64 first_byte) { struct bio *bio; bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); - bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_byte >> 9; btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; @@ -2733,7 +2929,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) * a contiguous page to the previous one * @size: portion of page that we want to write * @offset: starting offset in the page - * @bdev: attach newly created bios to this bdev * @bio_ret: must be valid pointer, newly allocated bio will be stored there * @end_io_func: end_io callback for new bio * @mirror_num: desired mirror to read/write @@ -2744,7 +2939,6 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, struct writeback_control *wbc, struct page *page, u64 offset, size_t size, unsigned long pg_offset, - struct block_device *bdev, struct bio **bio_ret, bio_end_io_t end_io_func, int mirror_num, @@ -2784,20 +2978,24 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, bio = NULL; } else { if (wbc) - wbc_account_io(wbc, page, page_size); + wbc_account_cgroup_owner(wbc, page, page_size); return 0; } } - bio = btrfs_bio_alloc(bdev, offset); + bio = btrfs_bio_alloc(offset); bio_add_page(bio, page, page_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; bio->bi_write_hint = page->mapping->host->i_write_hint; bio->bi_opf = opf; if (wbc) { + struct block_device *bdev; + + bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev; + bio_set_dev(bio, bdev); wbc_init_bio(wbc, bio); - wbc_account_io(wbc, page, page_size); + wbc_account_cgroup_owner(wbc, page, page_size); } *bio_ret = bio; @@ -2877,7 +3075,6 @@ static int __do_readpage(struct extent_io_tree *tree, u64 block_start; u64 cur_end; struct extent_map *em; - struct block_device *bdev; int ret = 0; int nr = 0; size_t pg_offset = 0; @@ -2954,7 +3151,6 @@ static int __do_readpage(struct extent_io_tree *tree, offset = em->block_start + extent_offset; disk_io_size = iosize; } - bdev = em->bdev; block_start = em->block_start; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) block_start = EXTENT_MAP_HOLE; @@ -3044,7 +3240,7 @@ static int __do_readpage(struct extent_io_tree *tree, ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL, page, offset, disk_io_size, - pg_offset, bdev, bio, + pg_offset, bio, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag, @@ -3069,7 +3265,7 @@ out: return ret; } -static inline void __do_contiguous_readpages(struct extent_io_tree *tree, +static inline void contiguous_readpages(struct extent_io_tree *tree, struct page *pages[], int nr_pages, u64 start, u64 end, struct extent_map **em_cached, @@ -3077,21 +3273,10 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, unsigned long *bio_flags, u64 *prev_em_start) { - struct inode *inode; - struct btrfs_ordered_extent *ordered; + struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); int index; - inode = pages[0]->mapping->host; - while (1) { - lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, - end - start + 1); - if (!ordered) - break; - unlock_extent(tree, start, end); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } + btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, @@ -3100,46 +3285,6 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, } } -static void __extent_readpages(struct extent_io_tree *tree, - struct page *pages[], - int nr_pages, - struct extent_map **em_cached, - struct bio **bio, unsigned long *bio_flags, - u64 *prev_em_start) -{ - u64 start = 0; - u64 end = 0; - u64 page_start; - int index; - int first_index = 0; - - for (index = 0; index < nr_pages; index++) { - page_start = page_offset(pages[index]); - if (!end) { - start = page_start; - end = start + PAGE_SIZE - 1; - first_index = index; - } else if (end + 1 == page_start) { - end += PAGE_SIZE; - } else { - __do_contiguous_readpages(tree, &pages[first_index], - index - first_index, start, - end, em_cached, - bio, bio_flags, - prev_em_start); - start = page_start; - end = start + PAGE_SIZE - 1; - first_index = index; - } - } - - if (end) - __do_contiguous_readpages(tree, &pages[first_index], - index - first_index, start, - end, em_cached, bio, - bio_flags, prev_em_start); -} - static int __extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, @@ -3147,22 +3292,12 @@ static int __extent_read_full_page(struct extent_io_tree *tree, unsigned long *bio_flags, unsigned int read_flags) { - struct inode *inode = page->mapping->host; - struct btrfs_ordered_extent *ordered; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; int ret; - while (1) { - lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, - PAGE_SIZE); - if (!ordered) - break; - unlock_extent(tree, start, end); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } + btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, bio_flags, read_flags, NULL); @@ -3203,7 +3338,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, struct page *page, struct writeback_control *wbc, u64 delalloc_start, unsigned long *nr_written) { - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; u64 page_end = delalloc_start + PAGE_SIZE - 1; bool found; u64 delalloc_to_write = 0; @@ -3213,8 +3347,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, while (delalloc_end < page_end) { - found = find_lock_delalloc_range(inode, tree, - page, + found = find_lock_delalloc_range(inode, page, &delalloc_start, &delalloc_end); if (!found) { @@ -3223,7 +3356,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, } ret = btrfs_run_delalloc_range(inode, page, delalloc_start, delalloc_end, &page_started, nr_written, wbc); - /* File system has been set read-only */ if (ret) { SetPageError(page); /* @@ -3285,7 +3417,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, struct extent_page_data *epd, loff_t i_size, unsigned long nr_written, - unsigned int write_flags, int *nr_ret) + int *nr_ret) { struct extent_io_tree *tree = epd->tree; u64 start = page_offset(page); @@ -3296,11 +3428,11 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, u64 block_start; u64 iosize; struct extent_map *em; - struct block_device *bdev; size_t pg_offset = 0; size_t blocksize; int ret = 0; int nr = 0; + const unsigned int write_flags = wbc_to_write_flags(wbc); bool compressed; ret = btrfs_writepage_cow_fixup(page, start, page_end); @@ -3354,7 +3486,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, iosize = min(em_end - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); offset = em->block_start + extent_offset; - bdev = em->bdev; block_start = em->block_start; compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); free_extent_map(em); @@ -3396,7 +3527,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, page, offset, iosize, pg_offset, - bdev, &epd->bio, + &epd->bio, end_bio_extent_writepage, 0, 0, 0, false); if (ret) { @@ -3419,6 +3550,9 @@ done: * records are inserted to lock ranges in the tree, and as dirty areas * are found, they are marked writeback. Then the lock bits are removed * and the end_io handler clears the writeback ranges + * + * Return 0 if everything goes well. + * Return <0 for error. */ static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct extent_page_data *epd) @@ -3431,11 +3565,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, size_t pg_offset = 0; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - unsigned int write_flags = 0; unsigned long nr_written = 0; - write_flags = wbc_to_write_flags(wbc); - trace___extent_writepage(page, inode, wbc); WARN_ON(!PageLocked(page)); @@ -3473,7 +3604,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } ret = __extent_writepage_io(inode, page, wbc, epd, - i_size, nr_written, write_flags, &nr); + i_size, nr_written, &nr); if (ret == 1) goto done_unlocked; @@ -3488,6 +3619,7 @@ done: end_extent_writepage(page, ret, start, page_end); } unlock_page(page); + ASSERT(ret <= 0); return ret; done_unlocked: @@ -3500,18 +3632,33 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) TASK_UNINTERRUPTIBLE); } -static noinline_for_stack int -lock_extent_buffer_for_io(struct extent_buffer *eb, - struct btrfs_fs_info *fs_info, +static void end_extent_buffer_writeback(struct extent_buffer *eb) +{ + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); +} + +/* + * Lock eb pages and flush the bio if we can't the locks + * + * Return 0 if nothing went wrong + * Return >0 is same as 0, except bio is not submitted + * Return <0 if something went wrong, no page is locked + */ +static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, struct extent_page_data *epd) { - int i, num_pages; + struct btrfs_fs_info *fs_info = eb->fs_info; + int i, num_pages, failed_page_nr; int flush = 0; int ret = 0; if (!btrfs_try_tree_write_lock(eb)) { + ret = flush_write_bio(epd); + if (ret < 0) + return ret; flush = 1; - flush_write_bio(epd); btrfs_tree_lock(eb); } @@ -3520,7 +3667,9 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, if (!epd->sync_io) return 0; if (!flush) { - flush_write_bio(epd); + ret = flush_write_bio(epd); + if (ret < 0) + return ret; flush = 1; } while (1) { @@ -3561,7 +3710,14 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, if (!trylock_page(p)) { if (!flush) { - flush_write_bio(epd); + int err; + + err = flush_write_bio(epd); + if (err < 0) { + ret = err; + failed_page_nr = i; + goto err_unlock; + } flush = 1; } lock_page(p); @@ -3569,24 +3725,45 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, } return ret; -} - -static void end_extent_buffer_writeback(struct extent_buffer *eb) -{ - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); +err_unlock: + /* Unlock already locked pages */ + for (i = 0; i < failed_page_nr; i++) + unlock_page(eb->pages[i]); + /* + * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it. + * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can + * be made and undo everything done before. + */ + btrfs_tree_lock(eb); + spin_lock(&eb->refs_lock); + set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + end_extent_buffer_writeback(eb); + spin_unlock(&eb->refs_lock); + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, + fs_info->dirty_metadata_batch); + btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + btrfs_tree_unlock(eb); + return ret; } static void set_btree_ioerr(struct page *page) { struct extent_buffer *eb = (struct extent_buffer *)page->private; + struct btrfs_fs_info *fs_info; SetPageError(page); if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return; /* + * If we error out, we should add back the dirty_metadata_bytes + * to make it consistent. + */ + fs_info = eb->fs_info; + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, + eb->len, fs_info->dirty_metadata_batch); + + /* * If writeback for a btree extent that doesn't belong to a log tree * failed, increment the counter transaction->eb_write_errors. * We do this because while the transaction is running and before it's @@ -3643,11 +3820,11 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) { struct bio_vec *bvec; struct extent_buffer *eb; - int i, done; + int done; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i, iter_all) { + bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; eb = (struct extent_buffer *)page->private; @@ -3672,11 +3849,10 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) } static noinline_for_stack int write_one_eb(struct extent_buffer *eb, - struct btrfs_fs_info *fs_info, struct writeback_control *wbc, struct extent_page_data *epd) { - struct block_device *bdev = fs_info->fs_devices->latest_bdev; + struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; u64 offset = eb->start; u32 nritems; @@ -3701,7 +3877,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 */ start = btrfs_item_nr_offset(nritems); - end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb); + end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); memzero_extent_buffer(eb, start, end - start); } @@ -3711,7 +3887,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, - p, offset, PAGE_SIZE, 0, bdev, + p, offset, PAGE_SIZE, 0, &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0, false); @@ -3744,7 +3920,6 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; - struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; struct extent_buffer *eb, *prev_eb = NULL; struct extent_page_data epd = { .bio = NULL, @@ -3819,13 +3994,17 @@ retry: continue; prev_eb = eb; - ret = lock_extent_buffer_for_io(eb, fs_info, &epd); + ret = lock_extent_buffer_for_io(eb, &epd); if (!ret) { free_extent_buffer(eb); continue; + } else if (ret < 0) { + done = 1; + free_extent_buffer(eb); + break; } - ret = write_one_eb(eb, fs_info, wbc, &epd); + ret = write_one_eb(eb, wbc, &epd); if (ret) { done = 1; free_extent_buffer(eb); @@ -3852,7 +4031,12 @@ retry: index = 0; goto retry; } - flush_write_bio(&epd); + ASSERT(ret <= 0); + if (ret < 0) { + end_write_bio(&epd, ret); + return ret; + } + ret = flush_write_bio(&epd); return ret; } @@ -3940,7 +4124,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - done_index = page->index; + done_index = page->index + 1; /* * At this point we hold neither the i_pages lock nor * the page lock: the page may be truncated or @@ -3949,7 +4133,8 @@ retry: * tmpfs file mapping */ if (!trylock_page(page)) { - flush_write_bio(epd); + ret = flush_write_bio(epd); + BUG_ON(ret < 0); lock_page(page); } @@ -3959,8 +4144,10 @@ retry: } if (wbc->sync_mode != WB_SYNC_NONE) { - if (PageWriteback(page)) - flush_write_bio(epd); + if (PageWriteback(page)) { + ret = flush_write_bio(epd); + BUG_ON(ret < 0); + } wait_on_page_writeback(page); } @@ -3971,22 +4158,7 @@ retry: } ret = __extent_writepage(page, wbc, epd); - - if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { - unlock_page(page); - ret = 0; - } if (ret < 0) { - /* - * done_index is set past this page, - * so media errors will not choke - * background writeout for the entire - * file. This has consequences for - * range_cyclic semantics (ie. it may - * not be suitable for data integrity - * writeout). - */ - done_index = page->index + 1; done = 1; break; } @@ -4029,8 +4201,14 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) }; ret = __extent_writepage(page, wbc, &epd); + ASSERT(ret <= 0); + if (ret < 0) { + end_write_bio(&epd, ret); + return ret; + } - flush_write_bio(&epd); + ret = flush_write_bio(&epd); + ASSERT(ret <= 0); return ret; } @@ -4055,8 +4233,12 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, .nr_to_write = nr_pages * 2, .range_start = start, .range_end = end + 1, + /* We're called from an async helper function */ + .punt_to_cgroup = 1, + .no_cgroup_owner = 1, }; + wbc_attach_fdatawrite_inode(&wbc_writepages, inode); while (start <= end) { page = find_get_page(mapping, start >> PAGE_SHIFT); if (clear_page_dirty_for_io(page)) @@ -4070,7 +4252,13 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, start += PAGE_SIZE; } - flush_write_bio(&epd); + ASSERT(ret <= 0); + if (ret == 0) + ret = flush_write_bio(&epd); + else + end_write_bio(&epd, ret); + + wbc_detach_inode(&wbc_writepages); return ret; } @@ -4086,7 +4274,12 @@ int extent_writepages(struct address_space *mapping, }; ret = extent_write_cache_pages(mapping, wbc, &epd); - flush_write_bio(&epd); + ASSERT(ret <= 0); + if (ret < 0) { + end_write_bio(&epd, ret); + return ret; + } + ret = flush_write_bio(&epd); return ret; } @@ -4102,6 +4295,8 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages, u64 prev_em_start = (u64)-1; while (!list_empty(pages)) { + u64 contig_end = 0; + for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) { struct page *page = lru_to_page(pages); @@ -4110,14 +4305,22 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages, if (add_to_page_cache_lru(page, mapping, page->index, readahead_gfp_mask(mapping))) { put_page(page); - continue; + break; } pagepool[nr++] = page; + contig_end = page_offset(page) + PAGE_SIZE - 1; } - __extent_readpages(tree, pagepool, nr, &em_cached, &bio, - &bio_flags, &prev_em_start); + if (nr) { + u64 contig_start = page_offset(pagepool[0]); + + ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); + + contiguous_readpages(tree, pagepool, nr, contig_start, + contig_end, &em_cached, &bio, &bio_flags, + &prev_em_start); + } } if (em_cached) @@ -4147,10 +4350,8 @@ int extent_invalidatepage(struct extent_io_tree *tree, lock_extent_bits(tree, start, end, &cached_state); wait_on_page_writeback(page); - clear_extent_bit(tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, - 1, 1, &cached_state); + clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 1, 1, &cached_state); return 0; } @@ -4166,10 +4367,9 @@ static int try_release_extent_state(struct extent_io_tree *tree, u64 end = start + PAGE_SIZE - 1; int ret = 1; - if (test_range_bit(tree, start, end, - EXTENT_IOBITS, 0, NULL)) + if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) { ret = 0; - else { + } else { /* * at this point we can safely clear everything except the * locked bit and the nodatasum bit @@ -4222,8 +4422,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) } if (!test_range_bit(tree, em->start, extent_map_end(em) - 1, - EXTENT_LOCKED | EXTENT_WRITEBACK, - 0, NULL)) { + EXTENT_LOCKED, 0, NULL)) { set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &btrfs_inode->runtime_flags); remove_extent_mapping(map, em); @@ -4372,8 +4571,7 @@ try_submit_last: * In this case, the first extent range will be cached but not emitted. * So we must emit it before ending extent_fiemap(). */ -static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info, - struct fiemap_extent_info *fieinfo, +static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache) { int ret; @@ -4407,6 +4605,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(inode)->root; struct fiemap_cache cache = { 0 }; + struct ulist *roots; + struct ulist *tmp_ulist; int end = 0; u64 em_start = 0; u64 em_len = 0; @@ -4420,6 +4620,13 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + roots = ulist_alloc(GFP_KERNEL); + tmp_ulist = ulist_alloc(GFP_KERNEL); + if (!roots || !tmp_ulist) { + ret = -ENOMEM; + goto out_free_ulist; + } + start = round_down(start, btrfs_inode_sectorsize(inode)); len = round_up(max, btrfs_inode_sectorsize(inode)) - start; @@ -4430,8 +4637,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(BTRFS_I(inode)), -1, 0); if (ret < 0) { - btrfs_free_path(path); - return ret; + goto out_free_ulist; } else { WARN_ON(!ret); if (ret == 1) @@ -4540,7 +4746,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, */ ret = btrfs_check_shared(root, btrfs_ino(BTRFS_I(inode)), - bytenr); + bytenr, roots, tmp_ulist); if (ret < 0) goto out_free; if (ret) @@ -4580,12 +4786,16 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } out_free: if (!ret) - ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache); + ret = emit_last_fiemap_cache(fieinfo, &cache); free_extent_map(em); out: - btrfs_free_path(path); unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); + +out_free_ulist: + btrfs_free_path(path); + ulist_free(roots); + ulist_free(tmp_ulist); return ret; } @@ -4672,13 +4882,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->fs_info = fs_info; eb->bflags = 0; rwlock_init(&eb->lock); - atomic_set(&eb->write_locks, 0); - atomic_set(&eb->read_locks, 0); atomic_set(&eb->blocking_readers, 0); - atomic_set(&eb->blocking_writers, 0); - atomic_set(&eb->spinning_readers, 0); - atomic_set(&eb->spinning_writers, 0); - eb->lock_nested = 0; + eb->blocking_writers = 0; + eb->lock_nested = false; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); @@ -4695,6 +4901,13 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, > MAX_INLINE_EXTENT_BUFFER_SIZE); BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); +#ifdef CONFIG_BTRFS_DEBUG + eb->spinning_writers = 0; + atomic_set(&eb->spinning_readers, 0); + atomic_set(&eb->read_locks, 0); + eb->write_locks = 0; +#endif + return eb; } @@ -4861,12 +5074,14 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, return eb; eb = alloc_dummy_extent_buffer(fs_info, start); if (!eb) - return NULL; + return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; again: ret = radix_tree_preload(GFP_NOFS); - if (ret) + if (ret) { + exists = ERR_PTR(ret); goto free_eb; + } spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, start >> PAGE_SHIFT, eb); @@ -5183,8 +5398,7 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } } -int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, int wait, int mirror_num) +int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) { int i; struct page *page; @@ -5196,6 +5410,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, unsigned long num_reads = 0; struct bio *bio = NULL; unsigned long bio_flags = 0; + struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -5746,13 +5961,13 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, btrfs_err(fs_info, "memmove bogus src_offset %lu move len %lu dst len %lu", src_offset, len, dst->len); - BUG_ON(1); + BUG(); } if (dst_offset + len > dst->len) { btrfs_err(fs_info, "memmove bogus dst_offset %lu move len %lu dst len %lu", dst_offset, len, dst->len); - BUG_ON(1); + BUG(); } while (len > 0) { @@ -5793,13 +6008,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, btrfs_err(fs_info, "memmove bogus src_offset %lu move len %lu len %lu", src_offset, len, dst->len); - BUG_ON(1); + BUG(); } if (dst_offset + len > dst->len) { btrfs_err(fs_info, "memmove bogus dst_offset %lu move len %lu len %lu", dst_offset, len, dst->len); - BUG_ON(1); + BUG(); } if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 08749e0b9c32..a8551a1f56e2 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -7,28 +7,6 @@ #include <linux/refcount.h> #include "ulist.h" -/* bits for the extent state */ -#define EXTENT_DIRTY (1U << 0) -#define EXTENT_WRITEBACK (1U << 1) -#define EXTENT_UPTODATE (1U << 2) -#define EXTENT_LOCKED (1U << 3) -#define EXTENT_NEW (1U << 4) -#define EXTENT_DELALLOC (1U << 5) -#define EXTENT_DEFRAG (1U << 6) -#define EXTENT_BOUNDARY (1U << 9) -#define EXTENT_NODATASUM (1U << 10) -#define EXTENT_CLEAR_META_RESV (1U << 11) -#define EXTENT_NEED_WAIT (1U << 12) -#define EXTENT_DAMAGED (1U << 13) -#define EXTENT_NORESERVE (1U << 14) -#define EXTENT_QGROUP_RESERVED (1U << 15) -#define EXTENT_CLEAR_DATA_RESV (1U << 16) -#define EXTENT_DELALLOC_NEW (1U << 17) -#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) -#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ - EXTENT_CLEAR_DATA_RESV) -#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING) - /* * flags for bio submission. The high bits indicate the compression * type for this bio @@ -82,15 +60,11 @@ enum { #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) -struct extent_state; struct btrfs_root; struct btrfs_inode; struct btrfs_io_bio; struct io_failure_record; - -typedef blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset); +struct extent_io_tree; typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct bio *bio, u64 bio_offset); @@ -100,37 +74,13 @@ struct extent_io_ops { * The following callbacks must be always defined, the function * pointer will be called unconditionally. */ - extent_submit_bio_hook_t *submit_bio_hook; + blk_status_t (*submit_bio_hook)(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror); }; -struct extent_io_tree { - struct rb_root state; - void *private_data; - u64 dirty_bytes; - int track_uptodate; - spinlock_t lock; - const struct extent_io_ops *ops; -}; - -struct extent_state { - u64 start; - u64 end; /* inclusive */ - struct rb_node rb_node; - - /* ADD NEW ELEMENTS AFTER THIS */ - wait_queue_head_t wq; - refcount_t refs; - unsigned state; - - struct io_failure_record *failrec; - -#ifdef CONFIG_BTRFS_DEBUG - struct list_head leak_list; -#endif -}; #define INLINE_EXTENT_BUFFER_PAGES 16 #define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE) @@ -146,14 +96,9 @@ struct extent_buffer { struct rcu_head rcu_head; pid_t lock_owner; - /* count of read lock holders on the extent buffer */ - atomic_t write_locks; - atomic_t read_locks; - atomic_t blocking_writers; + int blocking_writers; atomic_t blocking_readers; - atomic_t spinning_readers; - atomic_t spinning_writers; - short lock_nested; + bool lock_nested; /* >= 0 if eb belongs to a log tree, -1 otherwise */ short log_index; @@ -171,6 +116,10 @@ struct extent_buffer { wait_queue_head_t read_lock_wq; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG + int spinning_writers; + atomic_t spinning_readers; + atomic_t read_locks; + int write_locks; struct list_head leak_list; #endif }; @@ -239,145 +188,11 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, u64 start, u64 len, int create); -void extent_io_tree_init(struct extent_io_tree *tree, void *private_data); int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached); -static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - return lock_extent_bits(tree, start, end, NULL); -} - -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num); -int __init extent_io_init(void); -void __cold extent_io_exit(void); - -u64 count_range_bits(struct extent_io_tree *tree, - u64 *start, u64 search_end, - u64 max_bytes, unsigned bits, int contig); - -void free_extent_state(struct extent_state *state); -int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int filled, - struct extent_state *cached_state); -int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_changeset *changeset); -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, - struct extent_state **cached); -int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, - struct extent_state **cached, gfp_t mask, - struct extent_changeset *changeset); - -static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL); -} - -static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - GFP_NOFS, NULL); -} - -static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree, - u64 start, u64 end, struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - GFP_ATOMIC, NULL); -} - -static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits) -{ - int wake = 0; - - if (bits & EXTENT_LOCKED) - wake = 1; - - return clear_extent_bit(tree, start, end, bits, wake, 0, NULL); -} - -int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_changeset *changeset); -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask); - -static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits) -{ - return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS); -} - -static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state) -{ - return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, - cached_state, GFP_NOFS, NULL); -} - -static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, - NULL, mask); -} - -static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) -{ - return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, cached); -} - -int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, unsigned clear_bits, - struct extent_state **cached_state); - -static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, - u64 end, unsigned int extra_bits, - struct extent_state **cached_state) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits, - NULL, cached_state, GFP_NOFS); -} - -static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, - NULL, cached_state, GFP_NOFS); -} - -static inline int set_extent_new(struct extent_io_tree *tree, u64 start, - u64 end) -{ - return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, - GFP_NOFS); -} - -static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, - cached_state, mask); -} - -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits, - struct extent_state **cached_state); -int extent_invalidatepage(struct extent_io_tree *tree, - struct page *page, unsigned long offset); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); @@ -405,8 +220,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_NONE 0 #define WAIT_COMPLETE 1 #define WAIT_PAGE_LOCK 2 -int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, int wait, +int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num); void wait_on_extent_buffer_writeback(struct extent_buffer *eb); @@ -416,11 +230,6 @@ static inline int num_extent_pages(const struct extent_buffer *eb) (eb->start >> PAGE_SHIFT); } -static inline void extent_buffer_get(struct extent_buffer *eb) -{ - atomic_inc(&eb->refs); -} - static inline int extent_buffer_uptodate(struct extent_buffer *eb) { return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); @@ -468,10 +277,10 @@ int map_private_extent_buffer(const struct extent_buffer *eb, void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, - u64 delalloc_end, struct page *locked_page, - unsigned bits_to_clear, - unsigned long page_ops); -struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte); + struct page *locked_page, + unsigned bits_to_clear, + unsigned long page_ops); +struct bio *btrfs_bio_alloc(u64 first_byte); struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio); struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); @@ -482,13 +291,8 @@ struct btrfs_inode; int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num); -int clean_io_failure(struct btrfs_fs_info *fs_info, - struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, u64 start, - struct page *page, u64 ino, unsigned int pg_offset); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); -int repair_eb_io_failure(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int mirror_num); +int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num); /* * When IO fails, either with EIO or csum verification fails, we @@ -510,25 +314,17 @@ struct io_failure_record { }; -void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, - u64 end); -int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, - struct io_failure_record **failrec_ret); bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, struct io_failure_record *failrec, int fail_mirror); struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec, struct page *page, int pg_offset, int icsum, bio_end_io_t *endio_func, void *data); -int free_io_failure(struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, - struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, +bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, u64 *end); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); - #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 928f729c55ba..6f417ff68980 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -4,6 +4,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include "ctree.h" +#include "volumes.h" #include "extent_map.h" #include "compression.h" @@ -213,9 +214,13 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) ASSERT(next->block_start != EXTENT_MAP_DELALLOC && prev->block_start != EXTENT_MAP_DELALLOC); + if (prev->map_lookup || next->map_lookup) + ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) && + test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags)); + if (extent_map_end(prev) == next->start && prev->flags == next->flags && - prev->bdev == next->bdev && + prev->map_lookup == next->map_lookup && ((next->block_start == EXTENT_MAP_HOLE && prev->block_start == EXTENT_MAP_HOLE) || (next->block_start == EXTENT_MAP_INLINE && @@ -337,6 +342,37 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree, try_merge_map(tree, em); } +static void extent_map_device_set_bits(struct extent_map *em, unsigned bits) +{ + struct map_lookup *map = em->map_lookup; + u64 stripe_size = em->orig_block_len; + int i; + + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_bio_stripe *stripe = &map->stripes[i]; + struct btrfs_device *device = stripe->dev; + + set_extent_bits_nowait(&device->alloc_state, stripe->physical, + stripe->physical + stripe_size - 1, bits); + } +} + +static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) +{ + struct map_lookup *map = em->map_lookup; + u64 stripe_size = em->orig_block_len; + int i; + + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_bio_stripe *stripe = &map->stripes[i]; + struct btrfs_device *device = stripe->dev; + + __clear_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + stripe_size - 1, bits, + 0, 0, NULL, GFP_NOWAIT, NULL); + } +} + /** * add_extent_mapping - add new extent map to the extent tree * @tree: tree to insert new map in @@ -352,11 +388,17 @@ int add_extent_mapping(struct extent_map_tree *tree, { int ret = 0; + lockdep_assert_held_write(&tree->lock); + ret = tree_insert(&tree->map, em); if (ret) goto out; setup_extent_mapping(tree, em, modified); + if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) { + extent_map_device_set_bits(em, CHUNK_ALLOCATED); + extent_map_device_clear_bits(em, CHUNK_TRIMMED); + } out: return ret; } @@ -438,6 +480,8 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) rb_erase_cached(&em->rb_node, &tree->map); if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) list_del_init(&em->list); + if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) + extent_map_device_clear_bits(em, CHUNK_ALLOCATED); RB_CLEAR_NODE(&em->rb_node); } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 473f039fcd7c..8e217337dff9 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -42,15 +42,8 @@ struct extent_map { u64 block_len; u64 generation; unsigned long flags; - union { - struct block_device *bdev; - - /* - * used for chunk mappings - * flags & EXTENT_FLAG_FS_MAPPING must be set - */ - struct map_lookup *map_lookup; - }; + /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */ + struct map_lookup *map_lookup; refcount_t refs; unsigned int compress_type; struct list_head list; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index cccc75d15970..b1bfdc5c1387 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -8,6 +8,7 @@ #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/sched/mm.h> +#include <crypto/hash.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -22,9 +23,13 @@ #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ PAGE_SIZE)) -#define MAX_ORDERED_SUM_BYTES(fs_info) ((PAGE_SIZE - \ - sizeof(struct btrfs_ordered_sum)) / \ - sizeof(u32) * (fs_info)->sectorsize) +static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, + u16 csum_size) +{ + u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size; + + return ncsums * fs_info->sectorsize; +} int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -144,7 +149,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, } static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - u64 logical_offset, u32 *dst, int dio) + u64 logical_offset, u8 *dst, int dio) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio_vec bvec; @@ -182,7 +187,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio } csum = btrfs_bio->csum; } else { - csum = (u8 *)dst; + csum = dst; } if (bio->bi_iter.bi_size > PAGE_SIZE * 8) @@ -211,7 +216,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio if (!dio) offset = page_offset(bvec.bv_page) + bvec.bv_offset; count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, - (u32 *)csum, nblocks); + csum, nblocks); if (count) goto found; @@ -283,7 +288,8 @@ next: return 0; } -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, + u8 *dst) { return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); } @@ -374,7 +380,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_csum_item); while (start < csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(fs_info)); + max_ordered_sum_bytes(fs_info, csum_size)); sums = kzalloc(btrfs_ordered_sum_size(fs_info, size), GFP_NOFS); if (!sums) { @@ -413,10 +419,21 @@ fail: return ret; } +/* + * btrfs_csum_one_bio - Calculates checksums of the data contained inside a bio + * @inode: Owner of the data inside the bio + * @bio: Contains the data to be checksummed + * @file_start: offset in file this bio begins to describe + * @contig: Boolean. If true/1 means all bio vecs in this bio are + * contiguous and they begin at @file_start in the file. False/0 + * means this bio can contains potentially discontigous bio vecs + * so the logical offset of each should be calculated separately. + */ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; char *data; @@ -429,6 +446,7 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, int i; u64 offset; unsigned nofs_flag; + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); nofs_flag = memalloc_nofs_save(); sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), @@ -449,6 +467,8 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; index = 0; + shash->tfm = fs_info->csum_shash; + bio_for_each_segment(bvec, bio, iter) { if (!contig) offset = page_offset(bvec.bv_page) + bvec.bv_offset; @@ -458,8 +478,6 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, BUG_ON(!ordered); /* Logic error */ } - data = kmap_atomic(bvec.bv_page); - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len + fs_info->sectorsize - 1); @@ -469,10 +487,9 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, offset < ordered->file_offset) { unsigned long bytes_left; - kunmap_atomic(data); sums->len = this_sum_bytes; this_sum_bytes = 0; - btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_add_ordered_sum(ordered, sums); btrfs_put_ordered_extent(ordered); bytes_left = bio->bi_iter.bi_size - total_bytes; @@ -489,28 +506,24 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) + total_bytes; index = 0; - - data = kmap_atomic(bvec.bv_page); } - sums->sums[index] = ~(u32)0; - sums->sums[index] - = btrfs_csum_data(data + bvec.bv_offset - + (i * fs_info->sectorsize), - sums->sums[index], - fs_info->sectorsize); - btrfs_csum_final(sums->sums[index], - (char *)(sums->sums + index)); - index++; + crypto_shash_init(shash); + data = kmap_atomic(bvec.bv_page); + crypto_shash_update(shash, data + bvec.bv_offset + + (i * fs_info->sectorsize), + fs_info->sectorsize); + kunmap_atomic(data); + crypto_shash_final(shash, (char *)(sums->sums + index)); + index += csum_size; offset += fs_info->sectorsize; this_sum_bytes += fs_info->sectorsize; total_bytes += fs_info->sectorsize; } - kunmap_atomic(data); } this_sum_bytes = 0; - btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_add_ordered_sum(ordered, sums); btrfs_put_ordered_extent(ordered); return 0; } @@ -551,7 +564,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, */ u32 new_size = (bytenr - key->offset) >> blocksize_bits; new_size *= csum_size; - btrfs_truncate_item(fs_info, path, new_size, 1); + btrfs_truncate_item(path, new_size, 1); } else if (key->offset >= bytenr && csum_end > end_byte && end_byte > key->offset) { /* @@ -563,7 +576,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, u32 new_size = (csum_end - end_byte) >> blocksize_bits; new_size *= csum_size; - btrfs_truncate_item(fs_info, path, new_size, 0); + btrfs_truncate_item(path, new_size, 0); key->offset = end_byte; btrfs_set_item_key_safe(fs_info, path, key); @@ -577,9 +590,9 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, * range of bytes. */ int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, u64 len) + struct btrfs_root *root, u64 bytenr, u64 len) { - struct btrfs_root *root = fs_info->csum_root; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *path; struct btrfs_key key; u64 end_byte = bytenr + len; @@ -589,6 +602,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int blocksize_bits = fs_info->sb->s_blocksize_bits; + ASSERT(root == fs_info->csum_root || + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -832,11 +848,11 @@ again: u32 diff; u32 free_space; - if (btrfs_leaf_free_space(fs_info, leaf) < + if (btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item) + csum_size * 2) goto insert; - free_space = btrfs_leaf_free_space(fs_info, leaf) - + free_space = btrfs_leaf_free_space(leaf) - sizeof(struct btrfs_item) - csum_size; tmp = sums->len - total_bytes; tmp >>= fs_info->sb->s_blocksize_bits; @@ -852,7 +868,7 @@ again: diff /= csum_size; diff *= csum_size; - btrfs_extend_item(fs_info, path, diff); + btrfs_extend_item(path, diff); ret = 0; goto csum; } @@ -898,9 +914,9 @@ found: write_extent_buffer(leaf, sums->sums + index, (unsigned long)item, ins_size); + index += ins_size; ins_size /= csum_size; total_bytes += ins_size * fs_info->sectorsize; - index += ins_size; btrfs_mark_buffer_dirty(path->nodes[0]); if (total_bytes < sums->len) { @@ -932,7 +948,6 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, u8 type = btrfs_file_extent_type(leaf, fi); int compress_type = btrfs_file_extent_compression(leaf, fi); - em->bdev = fs_info->fs_devices->latest_bdev; btrfs_item_key_to_cpu(leaf, &key, slot); extent_start = key.offset; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 34fe8a58b0e9..8d47c76b7bd1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -26,6 +26,7 @@ #include "volumes.h" #include "qgroup.h" #include "compression.h" +#include "delalloc-space.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* @@ -295,7 +296,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, key.objectid = defrag->ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); + inode = btrfs_iget(fs_info->sb, &key, inode_root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto cleanup; @@ -536,8 +537,8 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, * we can set things up properly */ clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached); + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, cached); if (!btrfs_is_free_space_inode(BTRFS_I(inode))) { if (start_pos >= isize && @@ -558,7 +559,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, } err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, - extra_bits, cached, 0); + extra_bits, cached); if (err) return err; @@ -666,7 +667,6 @@ void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, } split->generation = gen; - split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; replace_extent_mapping(em_tree, em, split, modified); @@ -679,7 +679,6 @@ void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, split->start = start + len; split->len = em->start + em->len - (start + len); - split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; split->generation = gen; @@ -754,6 +753,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; + struct btrfs_ref ref = { 0 }; struct btrfs_key key; struct btrfs_key new_key; u64 ino = btrfs_ino(BTRFS_I(inode)); @@ -909,11 +909,14 @@ next_slot: btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) { - ret = btrfs_inc_extent_ref(trans, root, - disk_bytenr, num_bytes, 0, + btrfs_init_generic_ref(&ref, + BTRFS_ADD_DELAYED_REF, + disk_bytenr, num_bytes, 0); + btrfs_init_data_ref(&ref, root->root_key.objectid, new_key.objectid, start - extent_offset); + ret = btrfs_inc_extent_ref(trans, &ref); BUG_ON(ret); /* -ENOMEM */ } key.offset = start; @@ -993,11 +996,14 @@ delete_extent_item: extent_end = ALIGN(extent_end, fs_info->sectorsize); } else if (update_refs && disk_bytenr > 0) { - ret = btrfs_free_extent(trans, root, - disk_bytenr, num_bytes, 0, + btrfs_init_generic_ref(&ref, + BTRFS_DROP_DELAYED_REF, + disk_bytenr, num_bytes, 0); + btrfs_init_data_ref(&ref, root->root_key.objectid, - key.objectid, key.offset - - extent_offset); + key.objectid, + key.offset - extent_offset); + ret = btrfs_free_extent(trans, &ref); BUG_ON(ret); /* -ENOMEM */ inode_sub_bytes(inode, extent_end - key.offset); @@ -1025,7 +1031,7 @@ delete_extent_item: continue; } - BUG_ON(1); + BUG(); } if (!ret && del_nr > 0) { @@ -1050,7 +1056,7 @@ delete_extent_item: if (!ret && replace_extent && leafs_visited == 1 && (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING || path->locks[0] == BTRFS_WRITE_LOCK) && - btrfs_leaf_free_space(fs_info, leaf) >= + btrfs_leaf_free_space(leaf) >= sizeof(struct btrfs_item) + extent_item_size) { key.objectid = ino; @@ -1142,6 +1148,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_path *path; struct btrfs_file_extent_item *fi; + struct btrfs_ref ref = { 0 }; struct btrfs_key key; struct btrfs_key new_key; u64 bytenr; @@ -1287,9 +1294,11 @@ again: extent_end - split); btrfs_mark_buffer_dirty(leaf); - ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - ino, orig_offset); + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, + num_bytes, 0); + btrfs_init_data_ref(&ref, root->root_key.objectid, ino, + orig_offset); + ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -1311,6 +1320,9 @@ again: other_start = end; other_end = 0; + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, + num_bytes, 0); + btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset); if (extent_mergeable(leaf, path->slots[0] + 1, ino, bytenr, orig_offset, &other_start, &other_end)) { @@ -1321,9 +1333,7 @@ again: extent_end = other_end; del_slot = path->slots[0] + 1; del_nr++; - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - ino, orig_offset); + ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -1341,9 +1351,7 @@ again: key.offset = other_start; del_slot = path->slots[0]; del_nr++; - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - ino, orig_offset); + ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -1541,30 +1549,20 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; - struct btrfs_ordered_extent *ordered; u64 lockstart, lockend; u64 num_bytes; int ret; ret = btrfs_start_write_no_snapshotting(root); if (!ret) - return -ENOSPC; + return -EAGAIN; lockstart = round_down(pos, fs_info->sectorsize); lockend = round_up(pos + *write_bytes, fs_info->sectorsize) - 1; - while (1) { - lock_extent(&inode->io_tree, lockstart, lockend); - ordered = btrfs_lookup_ordered_range(inode, lockstart, - lockend - lockstart + 1); - if (!ordered) { - break; - } - unlock_extent(&inode->io_tree, lockstart, lockend); - btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } + btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart, + lockend, NULL); num_bytes = lockend - lockstart + 1; ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, @@ -1591,7 +1589,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; - struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; u64 lockstart; @@ -1611,6 +1608,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, return -ENOMEM; while (iov_iter_count(i) > 0) { + struct extent_state *cached_state = NULL; size_t offset = offset_in_page(pos); size_t sector_offset; size_t write_bytes = min(iov_iter_count(i), @@ -1636,6 +1634,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, break; } + only_release_metadata = false; sector_offset = pos & (fs_info->sectorsize - 1); reserve_bytes = round_up(write_bytes + sector_offset, fs_info->sectorsize); @@ -1692,7 +1691,7 @@ again: force_page_uptodate); if (ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes, true); + reserve_bytes); break; } @@ -1704,7 +1703,7 @@ again: if (extents_locked == -EAGAIN) goto again; btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes, true); + reserve_bytes); ret = extents_locked; break; } @@ -1758,11 +1757,21 @@ again: if (copied > 0) ret = btrfs_dirty_pages(inode, pages, dirty_pages, pos, copied, &cached_state); + + /* + * If we have not locked the extent range, because the range's + * start offset is >= i_size, we might still have a non-NULL + * cached extent state, acquired while marking the extent range + * as delalloc through btrfs_dirty_pages(). Therefore free any + * possible cached extent state to avoid a memory leak. + */ if (extents_locked) unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); - btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, - true); + else + free_extent_state(cached_state); + + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { btrfs_drop_pages(pages, num_pages); break; @@ -1781,7 +1790,6 @@ again: set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, EXTENT_NORESERVE, NULL, NULL, GFP_NOFS); - only_release_metadata = false; } btrfs_drop_pages(pages, num_pages); @@ -1882,10 +1890,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, u64 start_pos; u64 end_pos; ssize_t num_written = 0; - bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); + const bool sync = iocb->ki_flags & IOCB_DSYNC; ssize_t err; loff_t pos; - size_t count = iov_iter_count(from); + size_t count; loff_t oldsize; int clean_page = 0; @@ -1893,9 +1901,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP; - if (!inode_trylock(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) return -EAGAIN; + } else { inode_lock(inode); } @@ -1906,6 +1915,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, } pos = iocb->ki_pos; + count = iov_iter_count(from); if (iocb->ki_flags & IOCB_NOWAIT) { /* * We will allocate space in case nodatacow is not set, @@ -2056,13 +2066,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; - u64 len; - /* - * The range length can be represented by u64, we have to do the typecasts - * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync() - */ - len = (u64)end - (u64)start + 1; trace_btrfs_sync_file(file, datasync); btrfs_init_log_ctx(&ctx, inode); @@ -2089,6 +2093,19 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* + * If the inode needs a full sync, make sure we use a full range to + * avoid log tree corruption, due to hole detection racing with ordered + * extent completion for adjacent ranges, and assertion failures during + * hole detection. Do this while holding the inode lock, to avoid races + * with other tasks. + */ + if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags)) { + start = 0; + end = LLONG_MAX; + } + + /* * Before we acquired the inode's lock, someone may have dirtied more * pages in the target range. We need to make sure that writeback for * any such pages does not start while we are logging the inode, because @@ -2115,8 +2132,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* * We have to do this here to avoid the priority inversion of waiting on * IO of a lower priority task while holding a transaction open. + * + * Also, the range length can be represented by u64, we have to do the + * typecasts to avoid signed overflow if it's [0, LLONG_MAX]. */ - ret = btrfs_wait_ordered_range(inode, start, len); + ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1); if (ret) { up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); @@ -2165,7 +2185,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) inode_unlock(inode); goto out; } - trans->sync = true; ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx); if (ret < 0) { @@ -2339,7 +2358,6 @@ out: hole_em->block_start = EXTENT_MAP_HOLE; hole_em->block_len = 0; hole_em->orig_block_len = 0; - hole_em->bdev = fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; @@ -2428,27 +2446,286 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, return 0; } +static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_path *path, + struct btrfs_clone_extent_info *clone_info, + const u64 clone_len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *extent; + struct extent_buffer *leaf; + struct btrfs_key key; + int slot; + struct btrfs_ref ref = { 0 }; + u64 ref_offset; + int ret; + + if (clone_len == 0) + return 0; + + if (clone_info->disk_offset == 0 && + btrfs_fs_incompat(fs_info, NO_HOLES)) + return 0; + + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = clone_info->file_offset; + ret = btrfs_insert_empty_item(trans, root, path, &key, + clone_info->item_size); + if (ret) + return ret; + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, clone_info->extent_buf, + btrfs_item_ptr_offset(leaf, slot), + clone_info->item_size); + extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset); + btrfs_set_file_extent_num_bytes(leaf, extent, clone_len); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + /* If it's a hole, nothing more needs to be done. */ + if (clone_info->disk_offset == 0) + return 0; + + inode_add_bytes(inode, clone_len); + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, + clone_info->disk_offset, + clone_info->disk_len, 0); + ref_offset = clone_info->file_offset - clone_info->data_offset; + btrfs_init_data_ref(&ref, root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), ref_offset); + ret = btrfs_inc_extent_ref(trans, &ref); + + return ret; +} + +/* + * The respective range must have been previously locked, as well as the inode. + * The end offset is inclusive (last byte of the range). + * @clone_info is NULL for fallocate's hole punching and non-NULL for extent + * cloning. + * When cloning, we don't want to end up in a state where we dropped extents + * without inserting a new one, so we must abort the transaction to avoid a + * corruption. + */ +int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, + const u64 start, const u64 end, + struct btrfs_clone_extent_info *clone_info, + struct btrfs_trans_handle **trans_out) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); + u64 ino_size = round_up(inode->i_size, fs_info->sectorsize); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_block_rsv *rsv; + unsigned int rsv_count; + u64 cur_offset; + u64 drop_end; + u64 len = end - start; + int ret = 0; + + if (end <= start) + return -EINVAL; + + rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); + if (!rsv) { + ret = -ENOMEM; + goto out; + } + rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1); + rsv->failfast = 1; + + /* + * 1 - update the inode + * 1 - removing the extents in the range + * 1 - adding the hole extent if no_holes isn't set or if we are cloning + * an extent + */ + if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info) + rsv_count = 3; + else + rsv_count = 2; + + trans = btrfs_start_transaction(root, rsv_count); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out_free; + } + + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, + min_size, false); + BUG_ON(ret); + trans->block_rsv = rsv; + + cur_offset = start; + while (cur_offset < end) { + ret = __btrfs_drop_extents(trans, root, inode, path, + cur_offset, end + 1, &drop_end, + 1, 0, 0, NULL); + if (ret != -ENOSPC) { + /* + * When cloning we want to avoid transaction aborts when + * nothing was done and we are attempting to clone parts + * of inline extents, in such cases -EOPNOTSUPP is + * returned by __btrfs_drop_extents() without having + * changed anything in the file. + */ + if (clone_info && ret && ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, ret); + break; + } + + trans->block_rsv = &fs_info->trans_block_rsv; + + if (!clone_info && cur_offset < drop_end && + cur_offset < ino_size) { + ret = fill_holes(trans, BTRFS_I(inode), path, + cur_offset, drop_end); + if (ret) { + /* + * If we failed then we didn't insert our hole + * entries for the area we dropped, so now the + * fs is corrupted, so we must abort the + * transaction. + */ + btrfs_abort_transaction(trans, ret); + break; + } + } + + if (clone_info && drop_end > clone_info->file_offset) { + u64 clone_len = drop_end - clone_info->file_offset; + + ret = btrfs_insert_clone_extent(trans, inode, path, + clone_info, clone_len); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + clone_info->data_len -= clone_len; + clone_info->data_offset += clone_len; + clone_info->file_offset += clone_len; + } + + cur_offset = drop_end; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) + break; + + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); + + trans = btrfs_start_transaction(root, rsv_count); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + break; + } + + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, + rsv, min_size, false); + BUG_ON(ret); /* shouldn't happen */ + trans->block_rsv = rsv; + + if (!clone_info) { + ret = find_first_non_hole(inode, &cur_offset, &len); + if (unlikely(ret < 0)) + break; + if (ret && !len) { + ret = 0; + break; + } + } + } + + /* + * If we were cloning, force the next fsync to be a full one since we + * we replaced (or just dropped in the case of cloning holes when + * NO_HOLES is enabled) extents and extent maps. + * This is for the sake of simplicity, and cloning into files larger + * than 16Mb would force the full fsync any way (when + * try_release_extent_mapping() is invoked during page cache truncation. + */ + if (clone_info) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + + if (ret) + goto out_trans; + + trans->block_rsv = &fs_info->trans_block_rsv; + /* + * If we are using the NO_HOLES feature we might have had already an + * hole that overlaps a part of the region [lockstart, lockend] and + * ends at (or beyond) lockend. Since we have no file extent items to + * represent holes, drop_end can be less than lockend and so we must + * make sure we have an extent map representing the existing hole (the + * call to __btrfs_drop_extents() might have dropped the existing extent + * map representing the existing hole), otherwise the fast fsync path + * will not record the existence of the hole region + * [existing_hole_start, lockend]. + */ + if (drop_end <= end) + drop_end = end + 1; + /* + * Don't insert file hole extent item if it's for a range beyond eof + * (because it's useless) or if it represents a 0 bytes range (when + * cur_offset == drop_end). + */ + if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) { + ret = fill_holes(trans, BTRFS_I(inode), path, + cur_offset, drop_end); + if (ret) { + /* Same comment as above. */ + btrfs_abort_transaction(trans, ret); + goto out_trans; + } + } + if (clone_info) { + ret = btrfs_insert_clone_extent(trans, inode, path, clone_info, + clone_info->data_len); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_trans; + } + } + +out_trans: + if (!trans) + goto out_free; + + trans->block_rsv = &fs_info->trans_block_rsv; + if (ret) + btrfs_end_transaction(trans); + else + *trans_out = trans; +out_free: + btrfs_free_block_rsv(fs_info, rsv); +out: + return ret; +} + static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_block_rsv *rsv; - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; u64 lockstart; u64 lockend; u64 tail_start; u64 tail_len; u64 orig_start = offset; - u64 cur_offset; - u64 min_size = btrfs_calc_trans_metadata_size(fs_info, 1); - u64 drop_end; int ret = 0; - int err = 0; - unsigned int rsv_count; bool same_block; - bool no_holes = btrfs_fs_incompat(fs_info, NO_HOLES); u64 ino_size; bool truncated_block = false; bool updated_inode = false; @@ -2546,10 +2823,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); - if (ret) { - inode_unlock(inode); + if (ret) goto out_only_mutex; - } path = btrfs_alloc_path(); if (!path) { @@ -2557,145 +2832,24 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out; } - rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); - if (!rsv) { - ret = -ENOMEM; - goto out_free; - } - rsv->size = btrfs_calc_trans_metadata_size(fs_info, 1); - rsv->failfast = 1; - - /* - * 1 - update the inode - * 1 - removing the extents in the range - * 1 - adding the hole extent if no_holes isn't set - */ - rsv_count = no_holes ? 2 : 3; - trans = btrfs_start_transaction(root, rsv_count); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out_free; - } - - ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, - min_size, false); - BUG_ON(ret); - trans->block_rsv = rsv; - - cur_offset = lockstart; - len = lockend - cur_offset; - while (cur_offset < lockend) { - ret = __btrfs_drop_extents(trans, root, inode, path, - cur_offset, lockend + 1, - &drop_end, 1, 0, 0, NULL); - if (ret != -ENOSPC) - break; - - trans->block_rsv = &fs_info->trans_block_rsv; - - if (cur_offset < drop_end && cur_offset < ino_size) { - ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_end); - if (ret) { - /* - * If we failed then we didn't insert our hole - * entries for the area we dropped, so now the - * fs is corrupted, so we must abort the - * transaction. - */ - btrfs_abort_transaction(trans, ret); - err = ret; - break; - } - } - - cur_offset = drop_end; - - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - err = ret; - break; - } - - btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(fs_info); - - trans = btrfs_start_transaction(root, rsv_count); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - break; - } - - ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, - rsv, min_size, false); - BUG_ON(ret); /* shouldn't happen */ - trans->block_rsv = rsv; - - ret = find_first_non_hole(inode, &cur_offset, &len); - if (unlikely(ret < 0)) - break; - if (ret && !len) { - ret = 0; - break; - } - } - - if (ret) { - err = ret; - goto out_trans; - } - - trans->block_rsv = &fs_info->trans_block_rsv; - /* - * If we are using the NO_HOLES feature we might have had already an - * hole that overlaps a part of the region [lockstart, lockend] and - * ends at (or beyond) lockend. Since we have no file extent items to - * represent holes, drop_end can be less than lockend and so we must - * make sure we have an extent map representing the existing hole (the - * call to __btrfs_drop_extents() might have dropped the existing extent - * map representing the existing hole), otherwise the fast fsync path - * will not record the existence of the hole region - * [existing_hole_start, lockend]. - */ - if (drop_end <= lockend) - drop_end = lockend + 1; - /* - * Don't insert file hole extent item if it's for a range beyond eof - * (because it's useless) or if it represents a 0 bytes range (when - * cur_offset == drop_end). - */ - if (cur_offset < ino_size && cur_offset < drop_end) { - ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_end); - if (ret) { - /* Same comment as above. */ - btrfs_abort_transaction(trans, ret); - err = ret; - goto out_trans; - } - } - -out_trans: - if (!trans) - goto out_free; + ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL, + &trans); + btrfs_free_path(path); + if (ret) + goto out; + ASSERT(trans != NULL); inode_inc_iversion(inode); inode->i_mtime = inode->i_ctime = current_time(inode); - - trans->block_rsv = &fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); updated_inode = true; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); -out_free: - btrfs_free_path(path); - btrfs_free_block_rsv(fs_info, rsv); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); out_only_mutex: - if (!updated_inode && truncated_block && !ret && !err) { + if (!updated_inode && truncated_block && !ret) { /* * If we only end up zeroing part of a page, we still need to * update the inode item, so that all the time fields are @@ -2703,18 +2857,25 @@ out_only_mutex: * for detecting, at fsync time, if the inode isn't yet in the * log tree or it's there but not up to date. */ + struct timespec64 now = current_time(inode); + + inode_inc_iversion(inode); + inode->i_mtime = now; + inode->i_ctime = now; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); } else { - err = btrfs_update_inode(trans, root, inode); - ret = btrfs_end_transaction(trans); + int ret2; + + ret = btrfs_update_inode(trans, root, inode); + ret2 = btrfs_end_transaction(trans); + if (!ret) + ret = ret2; } } inode_unlock(inode); - if (ret && !err) - err = ret; - return err; + return ret; } /* Helper structure to record which range is already reserved */ @@ -2783,9 +2944,9 @@ static int btrfs_fallocate_update_isize(struct inode *inode, } enum { - RANGE_BOUNDARY_WRITTEN_EXTENT = 0, - RANGE_BOUNDARY_PREALLOC_EXTENT = 1, - RANGE_BOUNDARY_HOLE = 2, + RANGE_BOUNDARY_WRITTEN_EXTENT, + RANGE_BOUNDARY_PREALLOC_EXTENT, + RANGE_BOUNDARY_HOLE, }; static int btrfs_zero_range_check_range_boundary(struct inode *inode, @@ -3132,6 +3293,7 @@ static long btrfs_fallocate(struct file *file, int mode, ret = btrfs_qgroup_reserve_data(inode, &data_reserved, cur_offset, last_byte - cur_offset); if (ret < 0) { + cur_offset = last_byte; free_extent_map(em); break; } @@ -3181,34 +3343,35 @@ out: /* Let go of our reservation. */ if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) btrfs_free_reserved_data_space(inode, data_reserved, - alloc_start, alloc_end - cur_offset); + cur_offset, alloc_end - cur_offset); extent_changeset_free(data_reserved); return ret; } -static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) +static loff_t find_desired_extent(struct inode *inode, loff_t offset, + int whence) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em = NULL; struct extent_state *cached_state = NULL; + loff_t i_size = inode->i_size; u64 lockstart; u64 lockend; u64 start; u64 len; int ret = 0; - if (inode->i_size == 0) + if (i_size == 0 || offset >= i_size) return -ENXIO; /* - * *offset can be negative, in this case we start finding DATA/HOLE from + * offset can be negative, in this case we start finding DATA/HOLE from * the very start of the file. */ - start = max_t(loff_t, 0, *offset); + start = max_t(loff_t, 0, offset); lockstart = round_down(start, fs_info->sectorsize); - lockend = round_up(i_size_read(inode), - fs_info->sectorsize); + lockend = round_up(i_size, fs_info->sectorsize); if (lockend <= lockstart) lockend = lockstart + fs_info->sectorsize; lockend--; @@ -3217,7 +3380,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); - while (start < inode->i_size) { + while (start < i_size) { em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -3240,46 +3403,39 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) cond_resched(); } free_extent_map(em); - if (!ret) { - if (whence == SEEK_DATA && start >= inode->i_size) - ret = -ENXIO; - else - *offset = min_t(loff_t, start, inode->i_size); - } unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); - return ret; + if (ret) { + offset = ret; + } else { + if (whence == SEEK_DATA && start >= i_size) + offset = -ENXIO; + else + offset = min_t(loff_t, start, i_size); + } + + return offset; } static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; - int ret; - inode_lock(inode); switch (whence) { - case SEEK_END: - case SEEK_CUR: - offset = generic_file_llseek(file, offset, whence); - goto out; + default: + return generic_file_llseek(file, offset, whence); case SEEK_DATA: case SEEK_HOLE: - if (offset >= i_size_read(inode)) { - inode_unlock(inode); - return -ENXIO; - } - - ret = find_desired_extent(inode, &offset, whence); - if (ret) { - inode_unlock(inode); - return ret; - } + inode_lock_shared(inode); + offset = find_desired_extent(inode, offset, whence); + inode_unlock_shared(inode); + break; } - offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); -out: - inode_unlock(inode); - return offset; + if (offset < 0) + return offset; + + return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); } static int btrfs_file_open(struct inode *inode, struct file *filp) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 74aa552f4793..3283da419200 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -18,6 +18,9 @@ #include "extent_io.h" #include "inode-map.h" #include "volumes.h" +#include "space-info.h" +#include "delalloc-space.h" +#include "block-group.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_32K @@ -75,7 +78,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, * sure NOFS is set to keep us from deadlocking. */ nofs_flag = memalloc_nofs_save(); - inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path); + inode = btrfs_iget_path(fs_info->sb, &location, root, path); btrfs_release_path(path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) @@ -88,10 +91,10 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, return inode; } -struct inode *lookup_free_space_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache - *block_group, struct btrfs_path *path) +struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, + struct btrfs_path *path) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct inode *inode = NULL; u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; @@ -103,7 +106,7 @@ struct inode *lookup_free_space_inode(struct btrfs_fs_info *fs_info, return inode; inode = __lookup_free_space_inode(fs_info->tree_root, path, - block_group->key.objectid); + block_group->start); if (IS_ERR(inode)) return inode; @@ -185,20 +188,19 @@ static int __create_free_space_inode(struct btrfs_root *root, return 0; } -int create_free_space_inode(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, +int create_free_space_inode(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, struct btrfs_path *path) { int ret; u64 ino; - ret = btrfs_find_free_objectid(fs_info->tree_root, &ino); + ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino); if (ret < 0) return ret; - return __create_free_space_inode(fs_info->tree_root, trans, path, ino, - block_group->key.objectid); + return __create_free_space_inode(trans->fs_info->tree_root, trans, path, + ino, block_group->start); } int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, @@ -208,8 +210,8 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, int ret; /* 1 for slack space, 1 for updating the inode */ - needed_bytes = btrfs_calc_trunc_metadata_size(fs_info, 1) + - btrfs_calc_trans_metadata_size(fs_info, 1); + needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) + + btrfs_calc_metadata_size(fs_info, 1); spin_lock(&rsv->lock); if (rsv->reserved < needed_bytes) @@ -221,7 +223,7 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, } int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -382,6 +384,12 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode if (uptodate && !PageUptodate(page)) { btrfs_readpage(NULL, page); lock_page(page); + if (page->mapping != inode->i_mapping) { + btrfs_err(BTRFS_I(inode)->root->fs_info, + "free space cache page truncated"); + io_ctl_drop_pages(io_ctl); + return -EIO; + } if (!PageUptodate(page)) { btrfs_err(BTRFS_I(inode)->root->fs_info, "error reading free space cache"); @@ -465,9 +473,8 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; - crc = btrfs_csum_data(io_ctl->orig + offset, crc, - PAGE_SIZE - offset); - btrfs_csum_final(crc, (u8 *)&crc); + crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + btrfs_crc32c_final(crc, (u8 *)&crc); io_ctl_unmap_page(io_ctl); tmp = page_address(io_ctl->pages[0]); tmp += index; @@ -493,9 +500,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) val = *tmp; io_ctl_map_page(io_ctl, 0); - crc = btrfs_csum_data(io_ctl->orig + offset, crc, - PAGE_SIZE - offset); - btrfs_csum_final(crc, (u8 *)&crc); + crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + btrfs_crc32c_final(crc, (u8 *)&crc); if (val != crc) { btrfs_err_rl(io_ctl->fs_info, "csum mismatch on free space cache"); @@ -764,7 +770,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, } else { ASSERT(num_bitmaps); num_bitmaps--; - e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS); + e->bitmap = kmem_cache_zalloc( + btrfs_free_space_bitmap_cachep, GFP_NOFS); if (!e->bitmap) { kmem_cache_free( btrfs_free_space_cachep, e); @@ -812,15 +819,15 @@ free_cache: goto out; } -int load_free_space_cache(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group) +int load_free_space_cache(struct btrfs_block_group *block_group) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; struct btrfs_path *path; int ret = 0; bool matched; - u64 used = btrfs_block_group_used(&block_group->item); + u64 used = block_group->used; /* * If this block group has been marked to be cleared for one reason or @@ -858,7 +865,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, * once created get their ->cached field set to BTRFS_CACHE_FINISHED so * we will never try to read their inode item while the fs is mounted. */ - inode = lookup_free_space_inode(fs_info, block_group, path); + inode = lookup_free_space_inode(block_group, path); if (IS_ERR(inode)) { btrfs_free_path(path); return 0; @@ -874,13 +881,13 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, spin_unlock(&block_group->lock); ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, - path, block_group->key.objectid); + path, block_group->start); btrfs_free_path(path); if (ret <= 0) goto out; spin_lock(&ctl->tree_lock); - matched = (ctl->free_space == (block_group->key.offset - used - + matched = (ctl->free_space == (block_group->length - used - block_group->bytes_super)); spin_unlock(&ctl->tree_lock); @@ -888,7 +895,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, __btrfs_remove_free_space_cache(ctl); btrfs_warn(fs_info, "block group %llu has wrong amount of free space", - block_group->key.objectid); + block_group->start); ret = -1; } out: @@ -901,7 +908,7 @@ out: btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuilding it now", - block_group->key.objectid); + block_group->start); } iput(inode); @@ -911,7 +918,7 @@ out: static noinline_for_stack int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space_ctl *ctl, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, int *entries, int *bitmaps, struct list_head *bitmap_list) { @@ -1004,7 +1011,7 @@ update_cache_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL); + EXTENT_DELALLOC, 0, 0, NULL); goto fail; } leaf = path->nodes[0]; @@ -1016,9 +1023,8 @@ update_cache_item(struct btrfs_trans_handle *trans, if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, - inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, - NULL); + inode->i_size - 1, EXTENT_DELALLOC, 0, + 0, NULL); btrfs_release_path(path); goto fail; } @@ -1039,9 +1045,8 @@ fail: return -1; } -static noinline_for_stack int -write_pinned_extent_entries(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, +static noinline_for_stack int write_pinned_extent_entries( + struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, int *entries) { @@ -1059,11 +1064,11 @@ write_pinned_extent_entries(struct btrfs_fs_info *fs_info, * We shouldn't have switched the pinned extents yet so this is the * right one */ - unpin = fs_info->pinned_extents; + unpin = block_group->fs_info->pinned_extents; - start = block_group->key.objectid; + start = block_group->start; - while (start < block_group->key.objectid + block_group->key.offset) { + while (start < block_group->start + block_group->length) { ret = find_first_extent_bit(unpin, start, &extent_start, &extent_end, EXTENT_DIRTY, NULL); @@ -1071,13 +1076,12 @@ write_pinned_extent_entries(struct btrfs_fs_info *fs_info, return 0; /* This pinned extent is out of our range */ - if (extent_start >= block_group->key.objectid + - block_group->key.offset) + if (extent_start >= block_group->start + block_group->length) return 0; extent_start = max(extent_start, start); - extent_end = min(block_group->key.objectid + - block_group->key.offset, extent_end + 1); + extent_end = min(block_group->start + block_group->length, + extent_end + 1); len = extent_end - extent_start; *entries += 1; @@ -1115,7 +1119,7 @@ static int flush_dirty_cache(struct inode *inode) ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL); + EXTENT_DELALLOC, 0, 0, NULL); return ret; } @@ -1141,7 +1145,7 @@ cleanup_write_cache_enospc(struct inode *inode, static int __btrfs_wait_cache_io(struct btrfs_root *root, struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, struct btrfs_path *path, u64 offset) { @@ -1168,7 +1172,7 @@ out: #ifdef DEBUG btrfs_err(root->fs_info, "failed to write free space cache for block group %llu", - block_group->key.objectid); + block_group->start); #endif } } @@ -1210,12 +1214,12 @@ static int btrfs_wait_cache_io_root(struct btrfs_root *root, } int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path) { return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans, block_group, &block_group->io_ctl, - path, block_group->key.objectid); + path, block_group->start); } /** @@ -1231,11 +1235,10 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, */ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, struct btrfs_trans_handle *trans) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_state *cached_state = NULL; LIST_HEAD(bitmap_list); int entries = 0; @@ -1293,8 +1296,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * If this changes while we are working we'll get added back to * the dirty list and redo it. No locking needed */ - ret = write_pinned_extent_entries(fs_info, block_group, - io_ctl, &entries); + ret = write_pinned_extent_entries(block_group, io_ctl, &entries); if (ret) goto out_nospc_locked; @@ -1370,11 +1372,11 @@ out_unlock: goto out; } -int btrfs_write_out_cache(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, +int btrfs_write_out_cache(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, struct btrfs_path *path) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; int ret = 0; @@ -1386,7 +1388,7 @@ int btrfs_write_out_cache(struct btrfs_fs_info *fs_info, } spin_unlock(&block_group->lock); - inode = lookup_free_space_inode(fs_info, block_group, path); + inode = lookup_free_space_inode(block_group, path); if (IS_ERR(inode)) return 0; @@ -1396,7 +1398,7 @@ int btrfs_write_out_cache(struct btrfs_fs_info *fs_info, #ifdef DEBUG btrfs_err(fs_info, "failed to write free space cache for block group %llu", - block_group->key.objectid); + block_group->start); #endif spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_ERROR; @@ -1649,11 +1651,11 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) { - struct btrfs_block_group_cache *block_group = ctl->private; + struct btrfs_block_group *block_group = ctl->private; u64 max_bytes; u64 bitmap_bytes; u64 extent_bytes; - u64 size = block_group->key.offset; + u64 size = block_group->length; u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); @@ -1884,7 +1886,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info) { unlink_free_space(ctl, bitmap_info); - kfree(bitmap_info->bitmap); + kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); kmem_cache_free(btrfs_free_space_cachep, bitmap_info); ctl->total_bitmaps--; ctl->op->recalc_thresholds(ctl); @@ -1993,7 +1995,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { - struct btrfs_block_group_cache *block_group = ctl->private; + struct btrfs_block_group *block_group = ctl->private; struct btrfs_fs_info *fs_info = block_group->fs_info; bool forced = false; @@ -2030,7 +2032,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, * so allow those block groups to still be allowed to have a bitmap * entry. */ - if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset) + if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length) return false; return true; @@ -2045,7 +2047,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { struct btrfs_free_space *bitmap_info; - struct btrfs_block_group_cache *block_group = NULL; + struct btrfs_block_group *block_group = NULL; int added = 0; u64 bytes, offset, bytes_added; int ret; @@ -2138,7 +2140,8 @@ new_bitmap: } /* allocate the bitmap */ - info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS); + info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, + GFP_NOFS); spin_lock(&ctl->tree_lock); if (!info->bitmap) { ret = -ENOMEM; @@ -2149,7 +2152,9 @@ new_bitmap: out: if (info) { - kfree(info->bitmap); + if (info->bitmap) + kmem_cache_free(btrfs_free_space_bitmap_cachep, + info->bitmap); kmem_cache_free(btrfs_free_space_cachep, info); } @@ -2379,7 +2384,15 @@ out: return ret; } -int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, +int btrfs_add_free_space(struct btrfs_block_group *block_group, + u64 bytenr, u64 size) +{ + return __btrfs_add_free_space(block_group->fs_info, + block_group->free_space_ctl, + bytenr, size); +} + +int btrfs_remove_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -2469,7 +2482,7 @@ out: return ret; } -void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, +void btrfs_dump_free_space(struct btrfs_block_group *block_group, u64 bytes) { struct btrfs_fs_info *fs_info = block_group->fs_info; @@ -2494,14 +2507,14 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, "%d blocks of free space at or bigger than bytes is", count); } -void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) +void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; spin_lock_init(&ctl->tree_lock); ctl->unit = fs_info->sectorsize; - ctl->start = block_group->key.objectid; + ctl->start = block_group->start; ctl->private = block_group; ctl->op = &free_space_op; INIT_LIST_HEAD(&ctl->trimming_ranges); @@ -2523,7 +2536,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) */ static int __btrfs_return_cluster_to_free_space( - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -2589,7 +2602,7 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) spin_unlock(&ctl->tree_lock); } -void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) +void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_cluster *cluster; @@ -2611,7 +2624,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) } -u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, +u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 offset, u64 bytes, u64 empty_size, u64 *max_extent_size) { @@ -2665,7 +2678,7 @@ out: * cluster and remove the cluster from it. */ int btrfs_return_cluster_to_free_space( - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster) { struct btrfs_free_space_ctl *ctl; @@ -2699,7 +2712,7 @@ int btrfs_return_cluster_to_free_space( return ret; } -static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, +static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, struct btrfs_free_space *entry, u64 bytes, u64 min_start, @@ -2732,7 +2745,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, * if it couldn't find anything suitably large, or a logical disk offset * if things worked out */ -u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, +u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, u64 bytes, u64 min_start, u64 *max_extent_size) { @@ -2805,7 +2818,8 @@ out: if (entry->bytes == 0) { ctl->free_extents--; if (entry->bitmap) { - kfree(entry->bitmap); + kmem_cache_free(btrfs_free_space_bitmap_cachep, + entry->bitmap); ctl->total_bitmaps--; ctl->op->recalc_thresholds(ctl); } @@ -2817,7 +2831,7 @@ out: return ret; } -static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, +static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, struct btrfs_free_space *entry, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, @@ -2899,7 +2913,7 @@ again: * extent of cont1_bytes, and other clusters of at least min_bytes. */ static noinline int -setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, +setup_cluster_no_bitmap(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, u64 cont1_bytes, u64 min_bytes) @@ -2990,7 +3004,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, * that we have already failed to find extents that will work. */ static noinline int -setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, +setup_cluster_bitmap(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, u64 cont1_bytes, u64 min_bytes) @@ -3040,11 +3054,11 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, * returns zero and sets up cluster if things worked out, otherwise * it returns -enospc */ -int btrfs_find_space_cluster(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, +int btrfs_find_space_cluster(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry, *tmp; LIST_HEAD(bitmaps); @@ -3131,7 +3145,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) cluster->block_group = NULL; } -static int do_trimming(struct btrfs_block_group_cache *block_group, +static int do_trimming(struct btrfs_block_group *block_group, u64 *total_trimmed, u64 start, u64 bytes, u64 reserved_start, u64 reserved_bytes, struct btrfs_trim_range *trim_entry) @@ -3169,14 +3183,14 @@ static int do_trimming(struct btrfs_block_group_cache *block_group, space_info->bytes_readonly += reserved_bytes; block_group->reserved -= reserved_bytes; space_info->bytes_reserved -= reserved_bytes; - spin_unlock(&space_info->lock); spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); } return ret; } -static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, +static int trim_no_bitmap(struct btrfs_block_group *block_group, u64 *total_trimmed, u64 start, u64 end, u64 minlen) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -3261,7 +3275,7 @@ out: return ret; } -static int trim_bitmaps(struct btrfs_block_group_cache *block_group, +static int trim_bitmaps(struct btrfs_block_group *block_group, u64 *total_trimmed, u64 start, u64 end, u64 minlen) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -3342,12 +3356,12 @@ next: return ret; } -void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache) +void btrfs_get_block_group_trimming(struct btrfs_block_group *cache) { atomic_inc(&cache->trimming); } -void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) +void btrfs_put_block_group_trimming(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct extent_map_tree *em_tree; @@ -3361,15 +3375,11 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) if (cleanup) { mutex_lock(&fs_info->chunk_mutex); - em_tree = &fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree; write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, block_group->key.objectid, + em = lookup_extent_mapping(em_tree, block_group->start, 1); BUG_ON(!em); /* logic error, can't happen */ - /* - * remove_extent_mapping() will delete us from the pinned_chunks - * list, which is protected by the chunk mutex. - */ remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); mutex_unlock(&fs_info->chunk_mutex); @@ -3386,7 +3396,7 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) } } -int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, +int btrfs_trim_block_group(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen) { int ret; @@ -3584,7 +3594,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, * how the free space cache loading stuff works, so you can get really weird * configurations. */ -int test_add_free_space_entry(struct btrfs_block_group_cache *cache, +int test_add_free_space_entry(struct btrfs_block_group *cache, u64 offset, u64 bytes, bool bitmap) { struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; @@ -3613,7 +3623,7 @@ again: } if (!map) { - map = kzalloc(PAGE_SIZE, GFP_NOFS); + map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); if (!map) { kmem_cache_free(btrfs_free_space_cachep, info); return -ENOMEM; @@ -3642,7 +3652,8 @@ again: if (info) kmem_cache_free(btrfs_free_space_cachep, info); - kfree(map); + if (map) + kmem_cache_free(btrfs_free_space_bitmap_cachep, map); return 0; } @@ -3651,7 +3662,7 @@ again: * just used to check the absence of space, so if there is free space in the * range at all we will return 1. */ -int test_check_exists(struct btrfs_block_group_cache *cache, +int test_check_exists(struct btrfs_block_group *cache, u64 offset, u64 bytes) { struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 15e30b93db0d..ba9a23241101 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -36,29 +36,37 @@ struct btrfs_free_space_op { struct btrfs_free_space *info); }; -struct btrfs_io_ctl; +struct btrfs_io_ctl { + void *cur, *orig; + struct page *page; + struct page **pages; + struct btrfs_fs_info *fs_info; + struct inode *inode; + unsigned long size; + int index; + int num_pages; + int entries; + int bitmaps; + unsigned check_crcs:1; +}; -struct inode *lookup_free_space_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache - *block_group, struct btrfs_path *path); -int create_free_space_inode(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, +struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, + struct btrfs_path *path); +int create_free_space_inode(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, struct btrfs_path *path); int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct inode *inode); -int load_free_space_cache(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group); +int load_free_space_cache(struct btrfs_block_group *block_group); int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path); -int btrfs_write_out_cache(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, +int btrfs_write_out_cache(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, struct btrfs_path *path); struct inode *lookup_free_ino_inode(struct btrfs_root *root, struct btrfs_path *path); @@ -72,49 +80,40 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, struct btrfs_path *path, struct inode *inode); -void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group); +void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group); int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, struct btrfs_free_space_ctl *ctl, u64 bytenr, u64 size); -static inline int -btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 bytenr, u64 size) -{ - return __btrfs_add_free_space(block_group->fs_info, - block_group->free_space_ctl, - bytenr, size); -} -int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, +int btrfs_add_free_space(struct btrfs_block_group *block_group, + u64 bytenr, u64 size); +int btrfs_remove_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl); -void btrfs_remove_free_space_cache(struct btrfs_block_group_cache - *block_group); -u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, +void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group); +u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 offset, u64 bytes, u64 empty_size, u64 *max_extent_size); u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root); -void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, +void btrfs_dump_free_space(struct btrfs_block_group *block_group, u64 bytes); -int btrfs_find_space_cluster(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, +int btrfs_find_space_cluster(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size); void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); -u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, +u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, u64 bytes, u64 min_start, u64 *max_extent_size); int btrfs_return_cluster_to_free_space( - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster); -int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, +int btrfs_trim_block_group(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen); /* Support functions for running our sanity tests */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -int test_add_free_space_entry(struct btrfs_block_group_cache *cache, +int test_add_free_space_entry(struct btrfs_block_group *cache, u64 offset, u64 bytes, bool bitmap); -int test_check_exists(struct btrfs_block_group_cache *cache, - u64 offset, u64 bytes); +int test_check_exists(struct btrfs_block_group *cache, u64 offset, u64 bytes); #endif #endif diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index e5089087eaa6..258cb3fae17a 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -10,12 +10,13 @@ #include "locking.h" #include "free-space-tree.h" #include "transaction.h" +#include "block-group.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path); -void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache) +void set_free_space_tree_thresholds(struct btrfs_block_group *cache) { u32 bitmap_range; size_t bitmap_size; @@ -26,8 +27,7 @@ void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache) * exceeds that required for using bitmaps. */ bitmap_range = cache->fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; - num_bitmaps = div_u64(cache->key.offset + bitmap_range - 1, - bitmap_range); + num_bitmaps = div_u64(cache->length + bitmap_range - 1, bitmap_range); bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE; total_bitmap_size = num_bitmaps * bitmap_size; cache->bitmap_high_thresh = div_u64(total_bitmap_size, @@ -44,7 +44,7 @@ void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache) } static int add_new_free_space_info(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_root *root = trans->fs_info->free_space_root; @@ -53,9 +53,9 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int ret; - key.objectid = block_group->key.objectid; + key.objectid = block_group->start; key.type = BTRFS_FREE_SPACE_INFO_KEY; - key.offset = block_group->key.offset; + key.offset = block_group->length; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info)); if (ret) @@ -76,24 +76,25 @@ out: EXPORT_FOR_TESTS struct btrfs_free_space_info *search_free_space_info( - struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, struct btrfs_path *path, int cow) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = fs_info->free_space_root; struct btrfs_key key; int ret; - key.objectid = block_group->key.objectid; + key.objectid = block_group->start; key.type = BTRFS_FREE_SPACE_INFO_KEY; - key.offset = block_group->key.offset; + key.offset = block_group->length; ret = btrfs_search_slot(trans, root, &key, path, 0, cow); if (ret < 0) return ERR_PTR(ret); if (ret != 0) { btrfs_warn(fs_info, "missing free space info for %llu", - block_group->key.objectid); + block_group->start); ASSERT(0); return ERR_PTR(-ENOENT); } @@ -178,7 +179,7 @@ static void le_bitmap_set(unsigned long *map, unsigned int start, int len) EXPORT_FOR_TESTS int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -195,7 +196,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, int done = 0, nr; int ret; - bitmap_size = free_space_bitmap_size(block_group->key.offset, + bitmap_size = free_space_bitmap_size(block_group->length, fs_info->sectorsize); bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { @@ -203,8 +204,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, goto out; } - start = block_group->key.objectid; - end = block_group->key.objectid + block_group->key.offset; + start = block_group->start; + end = block_group->start + block_group->length; key.objectid = end - 1; key.type = (u8)-1; @@ -222,8 +223,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { - ASSERT(found_key.objectid == block_group->key.objectid); - ASSERT(found_key.offset == block_group->key.offset); + ASSERT(found_key.objectid == block_group->start); + ASSERT(found_key.offset == block_group->length); done = 1; break; } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) { @@ -253,7 +254,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, btrfs_release_path(path); } - info = search_free_space_info(trans, fs_info, block_group, path, 1); + info = search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); goto out; @@ -269,7 +270,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, if (extent_count != expected_extent_count) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", - block_group->key.objectid, extent_count, + block_group->start, extent_count, expected_extent_count); ASSERT(0); ret = -EIO; @@ -318,7 +319,7 @@ out: EXPORT_FOR_TESTS int convert_free_space_to_extents(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -334,7 +335,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, int done = 0, nr; int ret; - bitmap_size = free_space_bitmap_size(block_group->key.offset, + bitmap_size = free_space_bitmap_size(block_group->length, fs_info->sectorsize); bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { @@ -342,8 +343,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, goto out; } - start = block_group->key.objectid; - end = block_group->key.objectid + block_group->key.offset; + start = block_group->start; + end = block_group->start + block_group->length; key.objectid = end - 1; key.type = (u8)-1; @@ -361,8 +362,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { - ASSERT(found_key.objectid == block_group->key.objectid); - ASSERT(found_key.offset == block_group->key.offset); + ASSERT(found_key.objectid == block_group->start); + ASSERT(found_key.offset == block_group->length); done = 1; break; } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { @@ -398,7 +399,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, btrfs_release_path(path); } - info = search_free_space_info(trans, fs_info, block_group, path, 1); + info = search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); goto out; @@ -411,7 +412,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - nrbits = div_u64(block_group->key.offset, block_group->fs_info->sectorsize); + nrbits = div_u64(block_group->length, block_group->fs_info->sectorsize); start_bit = find_next_bit_le(bitmap, nrbits, 0); while (start_bit < nrbits) { @@ -435,7 +436,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, if (extent_count != expected_extent_count) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", - block_group->key.objectid, extent_count, + block_group->start, extent_count, expected_extent_count); ASSERT(0); ret = -EIO; @@ -451,7 +452,7 @@ out: } static int update_free_space_extent_count(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, int new_extents) { @@ -463,8 +464,7 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, if (new_extents == 0) return 0; - info = search_free_space_info(trans, trans->fs_info, block_group, path, - 1); + info = search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); goto out; @@ -490,7 +490,7 @@ out: } EXPORT_FOR_TESTS -int free_space_test_bit(struct btrfs_block_group_cache *block_group, +int free_space_test_bit(struct btrfs_block_group *block_group, struct btrfs_path *path, u64 offset) { struct extent_buffer *leaf; @@ -512,7 +512,7 @@ int free_space_test_bit(struct btrfs_block_group_cache *block_group, return !!extent_buffer_test_bit(leaf, ptr, i); } -static void free_space_set_bits(struct btrfs_block_group_cache *block_group, +static void free_space_set_bits(struct btrfs_block_group *block_group, struct btrfs_path *path, u64 *start, u64 *size, int bit) { @@ -580,7 +580,7 @@ static int free_space_next_bitmap(struct btrfs_trans_handle *trans, * the bitmap. */ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size, int remove) { @@ -596,7 +596,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, * Read the bit for the block immediately before the extent of space if * that block is within the block group. */ - if (start > block_group->key.objectid) { + if (start > block_group->start) { u64 prev_block = start - block_group->fs_info->sectorsize; key.objectid = prev_block; @@ -648,7 +648,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, * Read the bit for the block immediately after the extent of space if * that block is within the block group. */ - if (end < block_group->key.objectid + block_group->key.offset) { + if (end < block_group->start + block_group->length) { /* The next block may be in the next bitmap. */ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (end >= key.objectid + key.offset) { @@ -693,7 +693,7 @@ out: } static int remove_free_space_extent(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size) { @@ -780,7 +780,7 @@ out: EXPORT_FOR_TESTS int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size) { struct btrfs_free_space_info *info; @@ -793,8 +793,7 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, return ret; } - info = search_free_space_info(NULL, trans->fs_info, block_group, path, - 0); + info = search_free_space_info(NULL, block_group, path, 0); if (IS_ERR(info)) return PTR_ERR(info); flags = btrfs_free_space_flags(path->nodes[0], info); @@ -812,7 +811,7 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, int remove_from_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_path *path; int ret; @@ -846,7 +845,7 @@ out: } static int add_free_space_extent(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size) { @@ -880,7 +879,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans, new_key.offset = size; /* Search for a neighbor on the left. */ - if (start == block_group->key.objectid) + if (start == block_group->start) goto right; key.objectid = start - 1; key.type = (u8)-1; @@ -900,8 +899,8 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans, found_start = key.objectid; found_end = key.objectid + key.offset; - ASSERT(found_start >= block_group->key.objectid && - found_end > block_group->key.objectid); + ASSERT(found_start >= block_group->start && + found_end > block_group->start); ASSERT(found_start < start && found_end <= start); /* @@ -920,7 +919,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans, right: /* Search for a neighbor on the right. */ - if (end == block_group->key.objectid + block_group->key.offset) + if (end == block_group->start + block_group->length) goto insert; key.objectid = end; key.type = (u8)-1; @@ -940,8 +939,8 @@ right: found_start = key.objectid; found_end = key.objectid + key.offset; - ASSERT(found_start >= block_group->key.objectid && - found_end > block_group->key.objectid); + ASSERT(found_start >= block_group->start && + found_end > block_group->start); ASSERT((found_start < start && found_end <= start) || (found_start >= end && found_end > end)); @@ -974,10 +973,9 @@ out: EXPORT_FOR_TESTS int __add_to_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size) { - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_free_space_info *info; u32 flags; int ret; @@ -988,7 +986,7 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans, return ret; } - info = search_free_space_info(NULL, fs_info, block_group, path, 0); + info = search_free_space_info(NULL, block_group, path, 0); if (IS_ERR(info)) return PTR_ERR(info); flags = btrfs_free_space_flags(path->nodes[0], info); @@ -1006,7 +1004,7 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans, int add_to_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_path *path; int ret; @@ -1044,7 +1042,7 @@ out: * through the normal add/remove hooks. */ static int populate_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group) + struct btrfs_block_group *block_group) { struct btrfs_root *extent_root = trans->fs_info->extent_root; struct btrfs_path *path, *path2; @@ -1076,7 +1074,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, * BLOCK_GROUP_ITEM, so an extent may precede the block group that it's * contained in. */ - key.objectid = block_group->key.objectid; + key.objectid = block_group->start; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = 0; @@ -1085,8 +1083,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, goto out_locked; ASSERT(ret == 0); - start = block_group->key.objectid; - end = block_group->key.objectid + block_group->key.offset; + start = block_group->start; + end = block_group->start + block_group->length; while (1) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -1110,7 +1108,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, else start += key.offset; } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - if (key.objectid != block_group->key.objectid) + if (key.objectid != block_group->start) break; } @@ -1141,7 +1139,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) struct btrfs_trans_handle *trans; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *free_space_root; - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct rb_node *node; int ret; @@ -1150,7 +1148,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) return PTR_ERR(trans); set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); - free_space_root = btrfs_create_tree(trans, fs_info, + free_space_root = btrfs_create_tree(trans, BTRFS_FREE_SPACE_TREE_OBJECTID); if (IS_ERR(free_space_root)) { ret = PTR_ERR(free_space_root); @@ -1160,7 +1158,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) node = rb_first(&fs_info->block_group_cache_tree); while (node) { - block_group = rb_entry(node, struct btrfs_block_group_cache, + block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); if (ret) @@ -1248,7 +1246,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) list_del(&free_space_root->dirty_list); btrfs_tree_lock(free_space_root->node); - clean_tree_block(fs_info, free_space_root->node); + btrfs_clean_tree_block(free_space_root->node); btrfs_tree_unlock(free_space_root->node); btrfs_free_tree_block(trans, free_space_root, free_space_root->node, 0, 1); @@ -1266,7 +1264,7 @@ abort: } static int __add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path) { int ret; @@ -1278,12 +1276,12 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, return ret; return __add_to_free_space_tree(trans, block_group, path, - block_group->key.objectid, - block_group->key.offset); + block_group->start, + block_group->length); } int add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group) + struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *path = NULL; @@ -1313,7 +1311,7 @@ out: } int remove_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group) + struct btrfs_block_group *block_group) { struct btrfs_root *root = trans->fs_info->free_space_root; struct btrfs_path *path; @@ -1337,8 +1335,8 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, goto out; } - start = block_group->key.objectid; - end = block_group->key.objectid + block_group->key.offset; + start = block_group->start; + end = block_group->start + block_group->length; key.objectid = end - 1; key.type = (u8)-1; @@ -1356,8 +1354,8 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { - ASSERT(found_key.objectid == block_group->key.objectid); - ASSERT(found_key.offset == block_group->key.offset); + ASSERT(found_key.objectid == block_group->start); + ASSERT(found_key.offset == block_group->length); done = 1; nr++; path->slots[0]--; @@ -1392,7 +1390,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, struct btrfs_path *path, u32 expected_extent_count) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_fs_info *fs_info; struct btrfs_root *root; struct btrfs_key key; @@ -1408,7 +1406,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, fs_info = block_group->fs_info; root = fs_info->free_space_root; - end = block_group->key.objectid + block_group->key.offset; + end = block_group->start + block_group->length; while (1) { ret = btrfs_next_item(root, path); @@ -1455,7 +1453,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, if (extent_count != expected_extent_count) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", - block_group->key.objectid, extent_count, + block_group->start, extent_count, expected_extent_count); ASSERT(0); ret = -EIO; @@ -1473,7 +1471,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, struct btrfs_path *path, u32 expected_extent_count) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_fs_info *fs_info; struct btrfs_root *root; struct btrfs_key key; @@ -1486,7 +1484,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, fs_info = block_group->fs_info; root = fs_info->free_space_root; - end = block_group->key.objectid + block_group->key.offset; + end = block_group->start + block_group->length; while (1) { ret = btrfs_next_item(root, path); @@ -1517,7 +1515,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, if (extent_count != expected_extent_count) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", - block_group->key.objectid, extent_count, + block_group->start, extent_count, expected_extent_count); ASSERT(0); ret = -EIO; @@ -1533,15 +1531,13 @@ out: int load_free_space_tree(struct btrfs_caching_control *caching_ctl) { - struct btrfs_block_group_cache *block_group; - struct btrfs_fs_info *fs_info; + struct btrfs_block_group *block_group; struct btrfs_free_space_info *info; struct btrfs_path *path; u32 extent_count, flags; int ret; block_group = caching_ctl->block_group; - fs_info = block_group->fs_info; path = btrfs_alloc_path(); if (!path) @@ -1555,7 +1551,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) path->search_commit_root = 1; path->reada = READA_FORWARD; - info = search_free_space_info(NULL, fs_info, block_group, path, 0); + info = search_free_space_info(NULL, block_group, path, 0); if (IS_ERR(info)) { ret = PTR_ERR(info); goto out; diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index 3133651d7d70..dc2463e4cfe3 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -6,6 +6,8 @@ #ifndef BTRFS_FREE_SPACE_TREE_H #define BTRFS_FREE_SPACE_TREE_H +struct btrfs_caching_control; + /* * The default size for new free space bitmap items. The last bitmap in a block * group may be truncated, and none of the free space tree code assumes that @@ -14,14 +16,14 @@ #define BTRFS_FREE_SPACE_BITMAP_SIZE 256 #define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE) -void set_free_space_tree_thresholds(struct btrfs_block_group_cache *block_group); +void set_free_space_tree_thresholds(struct btrfs_block_group *block_group); int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info); int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info); int load_free_space_tree(struct btrfs_caching_control *caching_ctl); int add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group); + struct btrfs_block_group *block_group); int remove_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group); + struct btrfs_block_group *block_group); int add_to_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size); int remove_from_free_space_tree(struct btrfs_trans_handle *trans, @@ -30,22 +32,21 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_free_space_info * search_free_space_info(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, int cow); int __add_to_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size); int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size); int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path); int convert_free_space_to_extents(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path); -int free_space_test_bit(struct btrfs_block_group_cache *block_group, +int free_space_test_bit(struct btrfs_block_group *block_group, struct btrfs_path *path, u64 offset); #endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index a8956a3c9e05..668701832845 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -8,9 +8,9 @@ #include "transaction.h" #include "print-tree.h" -int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, - const char *name, - int name_len, struct btrfs_inode_ref **ref_ret) +struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, + int slot, const char *name, + int name_len) { struct btrfs_inode_ref *ref; unsigned long ptr; @@ -28,19 +28,15 @@ int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, cur_offset += len + sizeof(*ref); if (len != name_len) continue; - if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) { - if (ref_ret) - *ref_ret = ref; - return 1; - } + if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) + return ref; } - return 0; + return NULL; } -int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, - u64 ref_objectid, - const char *name, int name_len, - struct btrfs_inode_extref **extref_ret) +struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( + struct extent_buffer *leaf, int slot, u64 ref_objectid, + const char *name, int name_len) { struct btrfs_inode_extref *extref; unsigned long ptr; @@ -65,15 +61,12 @@ int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, if (ref_name_len == name_len && btrfs_inode_extref_parent(leaf, extref) == ref_objectid && - (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) { - if (extref_ret) - *extref_ret = extref; - return 1; - } + (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) + return extref; cur_offset += ref_name_len + sizeof(*extref); } - return 0; + return NULL; } /* Returns NULL if no extref found */ @@ -87,7 +80,6 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, { int ret; struct btrfs_key key; - struct btrfs_inode_extref *extref; key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; @@ -98,11 +90,9 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, return ERR_PTR(ret); if (ret > 0) return NULL; - if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, name, name_len, - &extref)) - return NULL; - return extref; + return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], + ref_objectid, name, name_len); + } static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, @@ -142,9 +132,9 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, * This should always succeed so error here will make the FS * readonly. */ - if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, - name, name_len, &extref)) { + extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], + ref_objectid, name, name_len); + if (!extref) { btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; goto out; @@ -170,7 +160,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, memmove_extent_buffer(leaf, ptr, ptr + del_len, item_size - (ptr + del_len - item_start)); - btrfs_truncate_item(root->fs_info, path, item_size - del_len, 1); + btrfs_truncate_item(path, item_size - del_len, 1); out: btrfs_free_path(path); @@ -213,8 +203,10 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, } else if (ret < 0) { goto out; } - if (!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len, &ref)) { + + ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name, + name_len); + if (!ref) { ret = -ENOENT; search_ext_refs = 1; goto out; @@ -234,7 +226,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); - btrfs_truncate_item(root->fs_info, path, item_size - sub_item_len, 1); + btrfs_truncate_item(path, item_size - sub_item_len, 1); out: btrfs_free_path(path); @@ -285,10 +277,10 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, if (btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, - name, name_len, NULL)) + name, name_len)) goto out; - btrfs_extend_item(root->fs_info, path, ins_len); + btrfs_extend_item(path, ins_len); ret = 0; } if (ret < 0) @@ -341,13 +333,13 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ins_len); if (ret == -EEXIST) { u32 old_size; - - if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len, &ref)) + ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], + name, name_len); + if (ref) goto out; old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - btrfs_extend_item(fs_info, path, ins_len); + btrfs_extend_item(path, ins_len); ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); @@ -359,7 +351,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ret == -EOVERFLOW) { if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len, &ref)) + name, name_len)) ret = -EEXIST; else ret = -EMLINK; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index ffca2abf13d0..37345fb6191d 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -11,6 +11,20 @@ #include "free-space-cache.h" #include "inode-map.h" #include "transaction.h" +#include "delalloc-space.h" + +static void fail_caching_thread(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + btrfs_warn(fs_info, "failed to start inode caching task"); + btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE, + "disabling inode map caching"); + spin_lock(&root->ino_cache_lock); + root->ino_cache_state = BTRFS_CACHE_ERROR; + spin_unlock(&root->ino_cache_lock); + wake_up(&root->ino_cache_wait); +} static int caching_kthread(void *data) { @@ -28,8 +42,10 @@ static int caching_kthread(void *data) return 0; path = btrfs_alloc_path(); - if (!path) + if (!path) { + fail_caching_thread(root); return -ENOMEM; + } /* Since the commit root is read-only, we can safely skip locking. */ path->skip_locking = 1; @@ -145,6 +161,7 @@ static void start_caching(struct btrfs_root *root) spin_lock(&root->ino_cache_lock); root->ino_cache_state = BTRFS_CACHE_FINISHED; spin_unlock(&root->ino_cache_lock); + wake_up(&root->ino_cache_wait); return; } @@ -159,15 +176,13 @@ static void start_caching(struct btrfs_root *root) if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) { __btrfs_add_free_space(fs_info, ctl, objectid, BTRFS_LAST_FREE_OBJECTID - objectid + 1); + wake_up(&root->ino_cache_wait); } tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu", root->root_key.objectid); - if (IS_ERR(tsk)) { - btrfs_warn(fs_info, "failed to start inode caching task"); - btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE, - "disabling inode map caching"); - } + if (IS_ERR(tsk)) + fail_caching_thread(root); } int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) @@ -185,11 +200,14 @@ again: wait_event(root->ino_cache_wait, root->ino_cache_state == BTRFS_CACHE_FINISHED || + root->ino_cache_state == BTRFS_CACHE_ERROR || root->free_ino_ctl->free_space > 0); if (root->ino_cache_state == BTRFS_CACHE_FINISHED && root->free_ino_ctl->free_space == 0) return -ENOSPC; + else if (root->ino_cache_state == BTRFS_CACHE_ERROR) + return btrfs_find_free_objectid(root, objectid); else goto again; } @@ -418,7 +436,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, * 1 item for free space object * 3 items for pre-allocation */ - trans->bytes_reserved = btrfs_calc_trans_metadata_size(fs_info, 10); + trans->bytes_reserved = btrfs_calc_insert_metadata_size(fs_info, 10); ret = btrfs_block_rsv_add(root, trans->block_rsv, trans->bytes_reserved, BTRFS_RESERVE_NO_FLUSH); @@ -483,12 +501,13 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); + btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true); goto out_put; } ret = btrfs_write_out_ino_cache(root, trans, path, inode); - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, false); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); out_put: iput(inode); out_release: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2973608824ec..c70baafb2a39 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -28,7 +28,9 @@ #include <linux/magic.h> #include <linux/iversion.h> #include <linux/swap.h> +#include <linux/sched/mm.h> #include <asm/unaligned.h> +#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -45,7 +47,8 @@ #include "backref.h" #include "props.h" #include "qgroup.h" -#include "dedupe.h" +#include "delalloc-space.h" +#include "block-group.h" struct btrfs_iget_args { struct btrfs_key *location; @@ -72,26 +75,15 @@ static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; - -#define S_SHIFT 12 -static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, - [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, - [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, -}; +struct kmem_cache *btrfs_free_space_bitmap_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, u64 delalloc_end, - int *page_started, unsigned long *nr_written, - int unlock, struct btrfs_dedupe_hash *hash); + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock); static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, @@ -187,6 +179,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, size_t cur_size = size; unsigned long offset; + ASSERT((compressed_size > 0 && compressed_pages) || + (compressed_size == 0 && !compressed_pages)); + if (compressed_size && compressed_pages) cur_size = compressed_size; @@ -366,18 +361,25 @@ struct async_extent { struct list_head list; }; -struct async_cow { +struct async_chunk { struct inode *inode; - struct btrfs_fs_info *fs_info; struct page *locked_page; u64 start; u64 end; unsigned int write_flags; struct list_head extents; + struct cgroup_subsys_state *blkcg_css; struct btrfs_work work; + atomic_t *pending; }; -static noinline int add_async_extent(struct async_cow *cow, +struct async_cow { + /* Number of chunks in flight; must be first in the structure */ + atomic_t num_chunks; + struct async_chunk chunks[]; +}; + +static noinline int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, u64 compressed_size, struct page **pages, @@ -398,10 +400,31 @@ static noinline int add_async_extent(struct async_cow *cow, return 0; } +/* + * Check if the inode has flags compatible with compression + */ +static inline bool inode_can_compress(struct inode *inode) +{ + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW || + BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + return false; + return true; +} + +/* + * Check if the inode needs to be submitted to compression, based on mount + * options, defragmentation, properties or heuristics. + */ static inline int inode_need_compress(struct inode *inode, u64 start, u64 end) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + if (!inode_can_compress(inode)) { + WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), + KERN_ERR "BTRFS: unexpected compression for ino %llu\n", + btrfs_ino(BTRFS_I(inode))); + return 0; + } /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) return 1; @@ -444,15 +467,15 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, * are written in the same order that the flusher thread sent them * down. */ -static noinline void compress_file_range(struct inode *inode, - struct page *locked_page, - u64 start, u64 end, - struct async_cow *async_cow, - int *num_added) +static noinline int compress_file_range(struct async_chunk *async_chunk) { + struct inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 blocksize = fs_info->sectorsize; + u64 start = async_chunk->start; + u64 end = async_chunk->end; u64 actual_end; + u64 i_size; int ret = 0; struct page **pages = NULL; unsigned long nr_pages; @@ -461,12 +484,25 @@ static noinline void compress_file_range(struct inode *inode, int i; int will_compress; int compress_type = fs_info->compress_type; + int compressed_extents = 0; int redirty = 0; inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, SZ_16K); - actual_end = min_t(u64, i_size_read(inode), end + 1); + /* + * We need to save i_size before now because it could change in between + * us evaluating the size and assigning it. This is because we lock and + * unlock the page in truncate and fallocate, and then modify the i_size + * later on. + * + * The barriers are to emulate READ_ONCE, remove that once i_size_read + * does that for us. + */ + barrier(); + i_size = i_size_read(inode); + barrier(); + actual_end = min_t(u64, i_size, end + 1); again: will_compress = 0; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; @@ -597,14 +633,21 @@ cont: * our outstanding extent for clearing delalloc for this * range. */ - extent_clear_unlock_delalloc(inode, start, end, end, - NULL, clear_flags, + extent_clear_unlock_delalloc(inode, start, end, NULL, + clear_flags, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); - goto free_pages_out; + + for (i = 0; i < nr_pages; i++) { + WARN_ON(pages[i]->mapping); + put_page(pages[i]); + } + kfree(pages); + + return 0; } } @@ -623,14 +666,14 @@ cont: */ total_in = ALIGN(total_in, PAGE_SIZE); if (total_compressed + blocksize <= total_in) { - *num_added += 1; + compressed_extents++; /* * The async work queues will take care of doing actual * allocation on disk for these compressed pages, and * will submit them to the elevator. */ - add_async_extent(async_cow, start, total_in, + add_async_extent(async_chunk, start, total_in, total_compressed, pages, nr_pages, compress_type); @@ -640,7 +683,7 @@ cont: cond_resched(); goto again; } - return; + return compressed_extents; } } if (pages) { @@ -670,25 +713,20 @@ cleanup_and_bail_uncompressed: * to our extent and set things up for the async work queue to run * cow_file_range to do the normal delalloc dance. */ - if (page_offset(locked_page) >= start && - page_offset(locked_page) <= end) - __set_page_dirty_nobuffers(locked_page); + if (async_chunk->locked_page && + (page_offset(async_chunk->locked_page) >= start && + page_offset(async_chunk->locked_page)) <= end) { + __set_page_dirty_nobuffers(async_chunk->locked_page); /* unlocked later on in the async handlers */ + } if (redirty) extent_range_redirty_for_io(inode, start, end); - add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0, + add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); - *num_added += 1; + compressed_extents++; - return; - -free_pages_out: - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - put_page(pages[i]); - } - kfree(pages); + return compressed_extents; } static void free_async_extent_pages(struct async_extent *async_extent) @@ -713,45 +751,38 @@ static void free_async_extent_pages(struct async_extent *async_extent) * queued. We walk all the async extents created by compress_file_range * and send them down to the disk. */ -static noinline void submit_compressed_extents(struct async_cow *async_cow) +static noinline void submit_compressed_extents(struct async_chunk *async_chunk) { - struct inode *inode = async_cow->inode; + struct inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct async_extent *async_extent; u64 alloc_hint = 0; struct btrfs_key ins; struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; int ret = 0; again: - while (!list_empty(&async_cow->extents)) { - async_extent = list_entry(async_cow->extents.next, + while (!list_empty(&async_chunk->extents)) { + async_extent = list_entry(async_chunk->extents.next, struct async_extent, list); list_del(&async_extent->list); - io_tree = &BTRFS_I(inode)->io_tree; - retry: + lock_extent(io_tree, async_extent->start, + async_extent->start + async_extent->ram_size - 1); /* did the compression code fall back to uncompressed IO? */ if (!async_extent->pages) { int page_started = 0; unsigned long nr_written = 0; - lock_extent(io_tree, async_extent->start, - async_extent->start + - async_extent->ram_size - 1); - /* allocate blocks */ - ret = cow_file_range(inode, async_cow->locked_page, + ret = cow_file_range(inode, async_chunk->locked_page, async_extent->start, async_extent->start + async_extent->ram_size - 1, - async_extent->start + - async_extent->ram_size - 1, - &page_started, &nr_written, 0, - NULL); + &page_started, &nr_written, 0); /* JDM XXX */ @@ -767,16 +798,13 @@ retry: async_extent->start + async_extent->ram_size - 1, WB_SYNC_ALL); - else if (ret) - unlock_page(async_cow->locked_page); + else if (ret && async_chunk->locked_page) + unlock_page(async_chunk->locked_page); kfree(async_extent); cond_resched(); continue; } - lock_extent(io_tree, async_extent->start, - async_extent->start + async_extent->ram_size - 1); - ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, async_extent->compressed_size, @@ -844,8 +872,6 @@ retry: extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, - async_extent->start + - async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK); @@ -855,7 +881,8 @@ retry: ins.objectid, ins.offset, async_extent->pages, async_extent->nr_pages, - async_cow->write_flags)) { + async_chunk->write_flags, + async_chunk->blkcg_css)) { struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; const u64 end = start + async_extent->ram_size - 1; @@ -864,7 +891,7 @@ retry: btrfs_writepage_endio_finish_ordered(p, start, end, 0); p->mapping = NULL; - extent_clear_unlock_delalloc(inode, start, end, end, + extent_clear_unlock_delalloc(inode, start, end, NULL, 0, PAGE_END_WRITEBACK | PAGE_SET_ERROR); @@ -882,8 +909,6 @@ out_free: extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, - async_extent->start + - async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, @@ -942,9 +967,8 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, */ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, u64 delalloc_end, - int *page_started, unsigned long *nr_written, - int unlock, struct btrfs_dedupe_hash *hash) + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -983,8 +1007,7 @@ static noinline int cow_file_range(struct inode *inode, * our outstanding extent for clearing delalloc for this * range. */ - extent_clear_unlock_delalloc(inode, start, end, - delalloc_end, NULL, + extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -1067,7 +1090,7 @@ static noinline int cow_file_range(struct inode *inode, extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, - delalloc_end, locked_page, + locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); if (num_bytes < cur_alloc_size) @@ -1112,7 +1135,6 @@ out_unlock: if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size, - start + cur_alloc_size, locked_page, clear_bits, page_ops); @@ -1120,8 +1142,7 @@ out_unlock: if (start >= end) goto out; } - extent_clear_unlock_delalloc(inode, start, end, delalloc_end, - locked_page, + extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits | EXTENT_CLEAR_DATA_RESV, page_ops); goto out; @@ -1132,16 +1153,15 @@ out_unlock: */ static noinline void async_cow_start(struct btrfs_work *work) { - struct async_cow *async_cow; - int num_added = 0; - async_cow = container_of(work, struct async_cow, work); + struct async_chunk *async_chunk; + int compressed_extents; - compress_file_range(async_cow->inode, async_cow->locked_page, - async_cow->start, async_cow->end, async_cow, - &num_added); - if (num_added == 0) { - btrfs_add_delayed_iput(async_cow->inode); - async_cow->inode = NULL; + async_chunk = container_of(work, struct async_chunk, work); + + compressed_extents = compress_file_range(async_chunk); + if (compressed_extents == 0) { + btrfs_add_delayed_iput(async_chunk->inode); + async_chunk->inode = NULL; } } @@ -1150,14 +1170,12 @@ static noinline void async_cow_start(struct btrfs_work *work) */ static noinline void async_cow_submit(struct btrfs_work *work) { - struct btrfs_fs_info *fs_info; - struct async_cow *async_cow; + struct async_chunk *async_chunk = container_of(work, struct async_chunk, + work); + struct btrfs_fs_info *fs_info = btrfs_work_owner(work); unsigned long nr_pages; - async_cow = container_of(work, struct async_cow, work); - - fs_info = async_cow->fs_info; - nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >> + nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; /* atomic_sub_return implies a barrier */ @@ -1166,69 +1184,139 @@ static noinline void async_cow_submit(struct btrfs_work *work) cond_wake_up_nomb(&fs_info->async_submit_wait); /* - * ->inode could be NULL if async_cow_start has failed to compress, + * ->inode could be NULL if async_chunk_start has failed to compress, * in which case we don't have anything to submit, yet we need to * always adjust ->async_delalloc_pages as its paired with the init * happening in cow_file_range_async */ - if (async_cow->inode) - submit_compressed_extents(async_cow); + if (async_chunk->inode) + submit_compressed_extents(async_chunk); } static noinline void async_cow_free(struct btrfs_work *work) { - struct async_cow *async_cow; - async_cow = container_of(work, struct async_cow, work); - if (async_cow->inode) - btrfs_add_delayed_iput(async_cow->inode); - kfree(async_cow); + struct async_chunk *async_chunk; + + async_chunk = container_of(work, struct async_chunk, work); + if (async_chunk->inode) + btrfs_add_delayed_iput(async_chunk->inode); + if (async_chunk->blkcg_css) + css_put(async_chunk->blkcg_css); + /* + * Since the pointer to 'pending' is at the beginning of the array of + * async_chunk's, freeing it ensures the whole array has been freed. + */ + if (atomic_dec_and_test(async_chunk->pending)) + kvfree(async_chunk->pending); } -static int cow_file_range_async(struct inode *inode, struct page *locked_page, +static int cow_file_range_async(struct inode *inode, + struct writeback_control *wbc, + struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written, - unsigned int write_flags) + unsigned long *nr_written) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct async_cow *async_cow; + struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); + struct async_cow *ctx; + struct async_chunk *async_chunk; unsigned long nr_pages; u64 cur_end; + u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K); + int i; + bool should_compress; + unsigned nofs_flag; + const unsigned int write_flags = wbc_to_write_flags(wbc); + + unlock_extent(&BTRFS_I(inode)->io_tree, start, end); + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && + !btrfs_test_opt(fs_info, FORCE_COMPRESS)) { + num_chunks = 1; + should_compress = false; + } else { + should_compress = true; + } + + nofs_flag = memalloc_nofs_save(); + ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + + if (!ctx) { + unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING; + unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | + PAGE_SET_ERROR; + + extent_clear_unlock_delalloc(inode, start, end, locked_page, + clear_bits, page_ops); + return -ENOMEM; + } + + async_chunk = ctx->chunks; + atomic_set(&ctx->num_chunks, num_chunks); + + for (i = 0; i < num_chunks; i++) { + if (should_compress) + cur_end = min(end, start + SZ_512K - 1); + else + cur_end = end; - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, - 1, 0, NULL); - while (start < end) { - async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); - BUG_ON(!async_cow); /* -ENOMEM */ /* * igrab is called higher up in the call chain, take only the * lightweight reference for the callback lifetime */ ihold(inode); - async_cow->inode = inode; - async_cow->fs_info = fs_info; - async_cow->locked_page = locked_page; - async_cow->start = start; - async_cow->write_flags = write_flags; - - if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && - !btrfs_test_opt(fs_info, FORCE_COMPRESS)) - cur_end = end; - else - cur_end = min(end, start + SZ_512K - 1); + async_chunk[i].pending = &ctx->num_chunks; + async_chunk[i].inode = inode; + async_chunk[i].start = start; + async_chunk[i].end = cur_end; + async_chunk[i].write_flags = write_flags; + INIT_LIST_HEAD(&async_chunk[i].extents); - async_cow->end = cur_end; - INIT_LIST_HEAD(&async_cow->extents); + /* + * The locked_page comes all the way from writepage and its + * the original page we were actually given. As we spread + * this large delalloc region across multiple async_chunk + * structs, only the first struct needs a pointer to locked_page + * + * This way we don't need racey decisions about who is supposed + * to unlock it. + */ + if (locked_page) { + /* + * Depending on the compressibility, the pages might or + * might not go through async. We want all of them to + * be accounted against wbc once. Let's do it here + * before the paths diverge. wbc accounting is used + * only for foreign writeback detection and doesn't + * need full accuracy. Just account the whole thing + * against the first page. + */ + wbc_account_cgroup_owner(wbc, locked_page, + cur_end - start); + async_chunk[i].locked_page = locked_page; + locked_page = NULL; + } else { + async_chunk[i].locked_page = NULL; + } + + if (blkcg_css != blkcg_root_css) { + css_get(blkcg_css); + async_chunk[i].blkcg_css = blkcg_css; + } else { + async_chunk[i].blkcg_css = NULL; + } - btrfs_init_work(&async_cow->work, - btrfs_delalloc_helper, - async_cow_start, async_cow_submit, - async_cow_free); + btrfs_init_work(&async_chunk[i].work, async_cow_start, + async_cow_submit, async_cow_free); - nr_pages = (cur_end - start + PAGE_SIZE) >> - PAGE_SHIFT; + nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); atomic_add(nr_pages, &fs_info->async_delalloc_pages); - btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work); + btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work); *nr_written += nr_pages; start = cur_end + 1; @@ -1268,36 +1356,25 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, */ static noinline int run_delalloc_nocow(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, int force, - unsigned long *nr_written) + const u64 start, const u64 end, + int *page_started, int force, + unsigned long *nr_written) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_buffer *leaf; struct btrfs_path *path; - struct btrfs_file_extent_item *fi; - struct btrfs_key found_key; - struct extent_map *em; - u64 cow_start; - u64 cur_offset; - u64 extent_end; - u64 extent_offset; - u64 disk_bytenr; - u64 num_bytes; - u64 disk_num_bytes; - u64 ram_bytes; - int extent_type; + u64 cow_start = (u64)-1; + u64 cur_offset = start; int ret; - int type; - int nocow; - int check_prev = 1; - bool nolock; + bool check_prev = true; + const bool freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode)); u64 ino = btrfs_ino(BTRFS_I(inode)); + bool nocow = false; + u64 disk_bytenr = 0; path = btrfs_alloc_path(); if (!path) { - extent_clear_unlock_delalloc(inode, start, end, end, - locked_page, + extent_clear_unlock_delalloc(inode, start, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, PAGE_UNLOCK | @@ -1307,15 +1384,29 @@ static noinline int run_delalloc_nocow(struct inode *inode, return -ENOMEM; } - nolock = btrfs_is_free_space_inode(BTRFS_I(inode)); - - cow_start = (u64)-1; - cur_offset = start; while (1) { + struct btrfs_key found_key; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + u64 extent_end; + u64 extent_offset; + u64 num_bytes = 0; + u64 disk_num_bytes; + u64 ram_bytes; + int extent_type; + + nocow = false; + ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); if (ret < 0) goto error; + + /* + * If there is no extent for our range when doing the initial + * search, then go back to the previous slot as it will be the + * one containing the search offset + */ if (ret > 0 && path->slots[0] > 0 && check_prev) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, @@ -1324,8 +1415,9 @@ static noinline int run_delalloc_nocow(struct inode *inode, found_key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } - check_prev = 0; + check_prev = false; next_slot: + /* Go to next leaf if we have exhausted the current one */ leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); @@ -1339,28 +1431,40 @@ next_slot: leaf = path->nodes[0]; } - nocow = 0; - disk_bytenr = 0; - num_bytes = 0; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + /* Didn't find anything for our INO */ if (found_key.objectid > ino) break; + /* + * Keep searching until we find an EXTENT_ITEM or there are no + * more extents for this inode + */ if (WARN_ON_ONCE(found_key.objectid < ino) || found_key.type < BTRFS_EXTENT_DATA_KEY) { path->slots[0]++; goto next_slot; } + + /* Found key is not EXTENT_DATA_KEY or starts after req range */ if (found_key.type > BTRFS_EXTENT_DATA_KEY || found_key.offset > end) break; + /* + * If the found extent starts after requested offset, then + * adjust extent_end to be right before this extent begins + */ if (found_key.offset > cur_offset) { extent_end = found_key.offset; extent_type = 0; goto out_check; } + /* + * Found extent which begins before our range and potentially + * intersect it + */ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); @@ -1374,26 +1478,36 @@ next_slot: btrfs_file_extent_num_bytes(leaf, fi); disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - if (extent_end <= start) { + /* + * If the extent we got ends before our current offset, + * skip to the next extent. + */ + if (extent_end <= cur_offset) { path->slots[0]++; goto next_slot; } + /* Skip holes */ if (disk_bytenr == 0) goto out_check; + /* Skip compressed/encrypted/encoded extents */ if (btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) goto out_check; /* - * Do the same check as in btrfs_cross_ref_exist but - * without the unnecessary search. + * If extent is created before the last volume's snapshot + * this implies the extent is shared, hence we can't do + * nocow. This is the same check as in + * btrfs_cross_ref_exist but without calling + * btrfs_search_slot. */ - if (!nolock && + if (!freespace_inode && btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out_check; if (extent_type == BTRFS_FILE_EXTENT_REG && !force) goto out_check; + /* If extent is RO, we must COW it */ if (btrfs_extent_readonly(fs_info, disk_bytenr)) goto out_check; ret = btrfs_cross_ref_exist(root, ino, @@ -1410,17 +1524,17 @@ next_slot: goto error; } - WARN_ON_ONCE(nolock); + WARN_ON_ONCE(freespace_inode); goto out_check; } disk_bytenr += extent_offset; disk_bytenr += cur_offset - found_key.offset; num_bytes = min(end + 1, extent_end) - cur_offset; /* - * if there are pending snapshots for this root, - * we fall into common COW way. + * If there are pending snapshots for this root, we + * fall into common COW way */ - if (!nolock && atomic_read(&root->snapshot_force_cow)) + if (!freespace_inode && atomic_read(&root->snapshot_force_cow)) goto out_check; /* * force cow if csum exists in the range. @@ -1439,27 +1553,29 @@ next_slot: cur_offset = cow_start; goto error; } - WARN_ON_ONCE(nolock); + WARN_ON_ONCE(freespace_inode); goto out_check; } if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) goto out_check; - nocow = 1; + nocow = true; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - extent_end = found_key.offset + - btrfs_file_extent_ram_bytes(leaf, fi); - extent_end = ALIGN(extent_end, - fs_info->sectorsize); + extent_end = found_key.offset + ram_bytes; + extent_end = ALIGN(extent_end, fs_info->sectorsize); + /* Skip extents outside of our requested range */ + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } } else { - BUG_ON(1); + /* If this triggers then we have a memory corruption */ + BUG(); } out_check: - if (extent_end <= start) { - path->slots[0]++; - if (nocow) - btrfs_dec_nocow_writers(fs_info, disk_bytenr); - goto next_slot; - } + /* + * If nocow is false then record the beginning of the range + * that needs to be COWed + */ if (!nocow) { if (cow_start == (u64)-1) cow_start = cur_offset; @@ -1471,11 +1587,16 @@ out_check: } btrfs_release_path(path); + + /* + * COW range from cow_start to found_key.offset - 1. As the key + * will contain the beginning of the first extent that can be + * NOCOW, following one which needs to be COW'ed + */ if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, found_key.offset - 1, - end, page_started, nr_written, 1, - NULL); + page_started, nr_written, 1); if (ret) { if (nocow) btrfs_dec_nocow_writers(fs_info, @@ -1487,6 +1608,7 @@ out_check: if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 orig_start = found_key.offset - extent_offset; + struct extent_map *em; em = create_io_em(inode, cur_offset, num_bytes, orig_start, @@ -1503,19 +1625,29 @@ out_check: goto error; } free_extent_map(em); - } - - if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - type = BTRFS_ORDERED_PREALLOC; + ret = btrfs_add_ordered_extent(inode, cur_offset, + disk_bytenr, num_bytes, + num_bytes, + BTRFS_ORDERED_PREALLOC); + if (ret) { + btrfs_drop_extent_cache(BTRFS_I(inode), + cur_offset, + cur_offset + num_bytes - 1, + 0); + goto error; + } } else { - type = BTRFS_ORDERED_NOCOW; + ret = btrfs_add_ordered_extent(inode, cur_offset, + disk_bytenr, num_bytes, + num_bytes, + BTRFS_ORDERED_NOCOW); + if (ret) + goto error; } - ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, - num_bytes, num_bytes, type); if (nocow) btrfs_dec_nocow_writers(fs_info, disk_bytenr); - BUG_ON(ret); /* -ENOMEM */ + nocow = false; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) @@ -1528,7 +1660,7 @@ out_check: num_bytes); extent_clear_unlock_delalloc(inode, cur_offset, - cur_offset + num_bytes - 1, end, + cur_offset + num_bytes - 1, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, @@ -1553,15 +1685,18 @@ out_check: if (cow_start != (u64)-1) { cur_offset = end; - ret = cow_file_range(inode, locked_page, cow_start, end, end, - page_started, nr_written, 1, NULL); + ret = cow_file_range(inode, locked_page, cow_start, end, + page_started, nr_written, 1); if (ret) goto error; } error: + if (nocow) + btrfs_dec_nocow_writers(fs_info, disk_bytenr); + if (ret && cur_offset < end) - extent_clear_unlock_delalloc(inode, cur_offset, end, end, + extent_clear_unlock_delalloc(inode, cur_offset, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -1602,7 +1737,6 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, { int ret; int force_cow = need_force_cow(inode, start, end); - unsigned int write_flags = wbc_to_write_flags(wbc); if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, @@ -1610,15 +1744,15 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); - } else if (!inode_need_compress(inode, start, end)) { - ret = cow_file_range(inode, locked_page, start, end, end, - page_started, nr_written, 1, NULL); + } else if (!inode_can_compress(inode) || + !inode_need_compress(inode, start, end)) { + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags); - ret = cow_file_range_async(inode, locked_page, start, end, - page_started, nr_written, - write_flags); + ret = cow_file_range_async(inode, wbc, locked_page, start, end, + page_started, nr_written); } if (ret) btrfs_cleanup_ordered_extents(inode, locked_page, start, @@ -1912,17 +2046,19 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, u64 length = 0; u64 map_length; int ret; + struct btrfs_io_geometry geom; if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; length = bio->bi_iter.bi_size; map_length = length; - ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - NULL, 0); + ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length, + &geom); if (ret < 0) return ret; - if (map_length < length + size) + + if (geom.len < length + size) return 1; return 0; } @@ -1964,11 +2100,11 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, * * c-3) otherwise: async submit */ -static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, + int mirror_num, + unsigned long bio_flags) + { - struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; @@ -2003,8 +2139,7 @@ static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio, goto mapit; /* we're doing a write, do the async checksumming */ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags, - bio_offset, inode, - btrfs_submit_bio_start); + 0, inode, btrfs_submit_bio_start); goto out; } else if (!skip_sum) { ret = btrfs_csum_one_bio(inode, bio, 0, 0); @@ -2013,7 +2148,7 @@ static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio, } mapit: - ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); + ret = btrfs_map_bio(fs_info, bio, mirror_num); out: if (ret) { @@ -2046,7 +2181,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, - struct extent_state **cached_state, int dedupe) + struct extent_state **cached_state) { WARN_ON(PAGE_ALIGNED(end)); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, @@ -2112,17 +2247,21 @@ again: } ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, - &cached_state, 0); + &cached_state); if (ret) { mapping_set_error(page->mapping, ret); end_extent_writepage(page, ret, page_start, page_end); ClearPageChecked(page); - goto out; + goto out_reserved; } ClearPageChecked(page); set_page_dirty(page); - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false); +out_reserved: + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + if (ret) + btrfs_delalloc_release_space(inode, data_reserved, page_start, + PAGE_SIZE, true); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); @@ -2163,8 +2302,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) SetPageChecked(page); get_page(page); - btrfs_init_work(&fixup->work, btrfs_fixup_helper, - btrfs_writepage_fixup_worker, NULL, NULL); + btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; btrfs_queue_work(fs_info->fixup_workers, &fixup->work); return -EBUSY; @@ -2531,6 +2669,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path, struct btrfs_file_extent_item *item; struct btrfs_ordered_extent *ordered; struct btrfs_trans_handle *trans; + struct btrfs_ref ref = { 0 }; struct btrfs_root *root; struct btrfs_key key; struct extent_buffer *leaf; @@ -2577,7 +2716,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(fs_info->sb, &key, root, NULL); + inode = btrfs_iget(fs_info->sb, &key, root); if (IS_ERR(inode)) { srcu_read_unlock(&fs_info->subvol_srcu, index); return 0; @@ -2701,10 +2840,11 @@ again: inode_add_bytes(inode, len); btrfs_release_path(path); - ret = btrfs_inc_extent_ref(trans, root, new->bytenr, - new->disk_len, 0, - backref->root_id, backref->inum, - new->file_pos); /* start - extent_offset */ + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new->bytenr, + new->disk_len, 0); + btrfs_init_data_ref(&ref, backref->root_id, backref->inum, + new->file_pos); /* start - extent_offset */ + ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); goto out_free_path; @@ -2900,7 +3040,7 @@ out_kfree: static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, u64 start, u64 len) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(fs_info, start); ASSERT(cache); @@ -2928,7 +3068,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) int compress_type = 0; int ret = 0; u64 logical_len = ordered_extent->len; - bool nolock; + bool freespace_inode; bool truncated = false; bool range_locked = false; bool clear_new_delalloc_bytes = false; @@ -2939,7 +3079,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) clear_new_delalloc_bytes = true; - nolock = btrfs_is_free_space_inode(BTRFS_I(inode)); + freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode)); if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { ret = -EIO; @@ -2970,8 +3110,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, ordered_extent->len); btrfs_ordered_update_i_size(inode, 0, ordered_extent); - if (nolock) - trans = btrfs_join_transaction_nolock(root); + if (freespace_inode) + trans = btrfs_join_transaction_spacecache(root); else trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -3005,8 +3145,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) EXTENT_DEFRAG, 0, 0, &cached_state); } - if (nolock) - trans = btrfs_join_transaction_nolock(root); + if (freespace_inode) + trans = btrfs_join_transaction_spacecache(root); else trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -3155,7 +3295,6 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered_extent = NULL; struct btrfs_workqueue *wq; - btrfs_work_func_t func; trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); @@ -3164,16 +3303,12 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, end - start + 1, uptodate)) return; - if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (btrfs_is_free_space_inode(BTRFS_I(inode))) wq = fs_info->endio_freespace_worker; - func = btrfs_freespace_write_helper; - } else { + else wq = fs_info->endio_write_workers; - func = btrfs_endio_write_helper; - } - btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, - NULL); + btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); btrfs_queue_work(wq, &ordered_extent->work); } @@ -3182,16 +3317,23 @@ static int __readpage_endio_check(struct inode *inode, int icsum, struct page *page, int pgoff, u64 start, size_t len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; - u32 csum_expected; - u32 csum = ~(u32)0; + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + u8 *csum_expected; + u8 csum[BTRFS_CSUM_SIZE]; - csum_expected = *(((u32 *)io_bio->csum) + icsum); + csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size; kaddr = kmap_atomic(page); - csum = btrfs_csum_data(kaddr + pgoff, csum, len); - btrfs_csum_final(csum, (u8 *)&csum); - if (csum != csum_expected) + shash->tfm = fs_info->csum_shash; + + crypto_shash_init(shash); + crypto_shash_update(shash, kaddr + pgoff, len); + crypto_shash_final(shash, csum); + + if (memcmp(csum, csum_expected, csum_size)) goto zeroit; kunmap_atomic(kaddr); @@ -3265,6 +3407,28 @@ void btrfs_add_delayed_iput(struct inode *inode) wake_up_process(fs_info->cleaner_kthread); } +static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) +{ + list_del_init(&inode->delayed_iput); + spin_unlock(&fs_info->delayed_iput_lock); + iput(&inode->vfs_inode); + if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) + wake_up(&fs_info->delayed_iputs_wait); + spin_lock(&fs_info->delayed_iput_lock); +} + +static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) +{ + if (!list_empty(&inode->delayed_iput)) { + spin_lock(&fs_info->delayed_iput_lock); + if (!list_empty(&inode->delayed_iput)) + run_delayed_iput_locked(fs_info, inode); + spin_unlock(&fs_info->delayed_iput_lock); + } +} + void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) { @@ -3274,12 +3438,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); - list_del_init(&inode->delayed_iput); - spin_unlock(&fs_info->delayed_iput_lock); - iput(&inode->vfs_inode); - if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) - wake_up(&fs_info->delayed_iputs_wait); - spin_lock(&fs_info->delayed_iput_lock); + run_delayed_iput_locked(fs_info, inode); } spin_unlock(&fs_info->delayed_iput_lock); } @@ -3408,7 +3567,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; - inode = btrfs_iget(fs_info->sb, &found_key, root, NULL); + inode = btrfs_iget(fs_info->sb, &found_key, root); ret = PTR_ERR_OR_ZERO(inode); if (ret && ret != -ENOENT) goto out; @@ -3699,21 +3858,6 @@ cache_index: * inode is not a directory, logging its parent unnecessarily. */ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; - /* - * Similar reasoning for last_link_trans, needs to be set otherwise - * for a case like the following: - * - * mkdir A - * touch foo - * ln foo A/bar - * echo 2 > /proc/sys/vm/drop_caches - * fsync foo - * <power failure> - * - * Would result in link bar and directory A not existing after the power - * failure. - */ - BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans; path->slots[0]++; if (inode->i_nlink != 1 || @@ -3795,7 +3939,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, { struct btrfs_map_token token; - btrfs_init_map_token(&token); + btrfs_init_map_token(&token, leaf); btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); @@ -3929,9 +4073,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; int ret = 0; - struct extent_buffer *leaf; struct btrfs_dir_item *di; - struct btrfs_key key; u64 index; u64 ino = btrfs_ino(inode); u64 dir_ino = btrfs_ino(dir); @@ -3949,8 +4091,6 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = di ? PTR_ERR(di) : -ENOENT; goto err; } - leaf = path->nodes[0]; - btrfs_dir_item_key_to_cpu(leaf, di, &key); ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto err; @@ -4003,6 +4143,17 @@ skip_backref: ret = 0; else if (ret) btrfs_abort_transaction(trans, ret); + + /* + * If we have a pending delayed iput we could end up with the final iput + * being run in btrfs-cleaner context. If we have enough of these built + * up we can end up burning a lot of time in btrfs-cleaner without any + * way to throttle the unlinks. Since we're currently holding a ref on + * the inode we can run the delayed iput here without any issues as the + * final iput won't be done until after we drop the ref we're currently + * holding. + */ + btrfs_run_delayed_iput(fs_info, inode); err: btrfs_free_path(path); if (ret) @@ -4087,18 +4238,30 @@ out: } static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, - struct inode *dir, u64 objectid, - const char *name, int name_len) + struct inode *dir, struct dentry *dentry) { struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; + const char *name = dentry->d_name.name; + int name_len = dentry->d_name.len; u64 index; int ret; + u64 objectid; u64 dir_ino = btrfs_ino(BTRFS_I(dir)); + if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { + objectid = inode->root->root_key.objectid; + } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { + objectid = inode->location.objectid; + } else { + WARN_ON(1); + return -EINVAL; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -4120,13 +4283,16 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, - dir_ino, &index, name, name_len); - if (ret < 0) { - if (ret != -ENOENT) { - btrfs_abort_transaction(trans, ret); - goto out; - } + /* + * This is a placeholder inode for a subvolume we didn't have a + * reference to at the time of the snapshot creation. In the meantime + * we could have renamed the real subvol link into our snapshot, so + * depending on btrfs_del_root_ref to return -ENOENT here is incorret. + * Instead simply lookup the dir_index_item for this entry so we can + * remove it. Otherwise we know we have a ref to the root and we can + * call btrfs_del_root_ref, and it _shouldn't_ fail. + */ + if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { di = btrfs_search_dir_index_item(root, path, dir_ino, name, name_len); if (IS_ERR_OR_NULL(di)) { @@ -4141,8 +4307,16 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); index = key.offset; + btrfs_release_path(path); + } else { + ret = btrfs_del_root_ref(trans, objectid, + root->root_key.objectid, dir_ino, + &index, name, name_len); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } } - btrfs_release_path(path); ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index); if (ret) { @@ -4336,8 +4510,7 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); - ret = btrfs_unlink_subvol(trans, dir, dest->root_key.objectid, - dentry->d_name.name, dentry->d_name.len); + ret = btrfs_unlink_subvol(trans, dir, dentry); if (ret) { err = ret; btrfs_abort_transaction(trans, ret); @@ -4432,10 +4605,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) return PTR_ERR(trans); if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - err = btrfs_unlink_subvol(trans, dir, - BTRFS_I(inode)->location.objectid, - dentry->d_name.name, - dentry->d_name.len); + err = btrfs_unlink_subvol(trans, dir, dentry); goto out; } @@ -4679,7 +4849,7 @@ search_again: btrfs_set_file_extent_ram_bytes(leaf, fi, size); size = btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(root->fs_info, path, size, 1); + btrfs_truncate_item(path, size, 1); } else if (!del_item) { /* * We have to bail so the last_size is set to @@ -4718,12 +4888,17 @@ delete: if (found_extent && (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || root == fs_info->tree_root)) { + struct btrfs_ref ref = { 0 }; + btrfs_set_path_blocking(path); bytes_deleted += extent_num_bytes; - ret = btrfs_free_extent(trans, root, extent_start, - extent_num_bytes, 0, - btrfs_header_owner(leaf), - ino, extent_offset); + + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, + extent_start, extent_num_bytes, 0); + ref.real_root = root->root_key.objectid; + btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), + ino, extent_offset); + ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; @@ -4844,7 +5019,7 @@ again: if (!page) { btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); ret = -ENOMEM; goto out; } @@ -4879,12 +5054,11 @@ again: } clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state); + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, - &cached_state, 0); + &cached_state); if (ret) { unlock_extent_cached(io_tree, block_start, block_end, &cached_state); @@ -4912,7 +5086,7 @@ out_unlock: if (ret) btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0)); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); unlock_page(page); put_page(page); out: @@ -4997,21 +5171,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) if (size <= hole_start) return 0; - while (1) { - struct btrfs_ordered_extent *ordered; - - lock_extent_bits(io_tree, hole_start, block_end - 1, - &cached_state); - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start, - block_end - hole_start); - if (!ordered) - break; - unlock_extent_cached(io_tree, hole_start, block_end - 1, - &cached_state); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } - + btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start, + block_end - 1, &cached_state); cur_offset = hole_start; while (1) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, @@ -5047,7 +5208,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_em->block_len = 0; hole_em->orig_block_len = 0; hole_em->ram_bytes = hole_size; - hole_em->bdev = fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = fs_info->generation; @@ -5278,9 +5438,9 @@ static void evict_inode_truncate_pages(struct inode *inode) btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); clear_extent_bit(io_tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY | - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 1, &cached_state); + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, + &cached_state); cond_resched(); spin_lock(&io_tree->lock); @@ -5293,59 +5453,50 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1); - int failures = 0; - - for (;;) { - struct btrfs_trans_handle *trans; - int ret; - - ret = btrfs_block_rsv_refill(root, rsv, - rsv->size + delayed_refs_extra, - BTRFS_RESERVE_FLUSH_LIMIT); - - if (ret && ++failures > 2) { - btrfs_warn(fs_info, - "could not allocate space for a delete; will truncate on mount"); - return ERR_PTR(-ENOSPC); - } - - /* - * Evict can generate a large amount of delayed refs without - * having a way to add space back since we exhaust our temporary - * block rsv. We aren't allowed to do FLUSH_ALL in this case - * because we could deadlock with so many things in the flushing - * code, so we have to try and hold some extra space to - * compensate for our delayed ref generation. If we can't get - * that space then we need see if we can steal our minimum from - * the global reserve. We will be ratelimited by the amount of - * space we have for the delayed refs rsv, so we'll end up - * committing and trying again. - */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans) || !ret) { - if (!IS_ERR(trans)) { - trans->block_rsv = &fs_info->trans_block_rsv; - trans->bytes_reserved = delayed_refs_extra; - btrfs_block_rsv_migrate(rsv, trans->block_rsv, - delayed_refs_extra, 1); - } - return trans; - } + struct btrfs_trans_handle *trans; + u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1); + int ret; + /* + * Eviction should be taking place at some place safe because of our + * delayed iputs. However the normal flushing code will run delayed + * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock. + * + * We reserve the delayed_refs_extra here again because we can't use + * btrfs_start_transaction(root, 0) for the same deadlocky reason as + * above. We reserve our extra bit here because we generate a ton of + * delayed refs activity by truncating. + * + * If we cannot make our reservation we'll attempt to steal from the + * global reserve, because we really want to be able to free up space. + */ + ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra, + BTRFS_RESERVE_FLUSH_EVICT); + if (ret) { /* * Try to steal from the global reserve if there is space for * it. */ - if (!btrfs_check_space_for_delayed_refs(fs_info) && - !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) - return trans; + if (btrfs_check_space_for_delayed_refs(fs_info) || + btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) { + btrfs_warn(fs_info, + "could not allocate space for delete; will truncate on mount"); + return ERR_PTR(-ENOSPC); + } + delayed_refs_extra = 0; + } - /* If not, commit and try again. */ - ret = btrfs_commit_transaction(trans); - if (ret) - return ERR_PTR(ret); + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return trans; + + if (delayed_refs_extra) { + trans->block_rsv = &fs_info->trans_block_rsv; + trans->bytes_reserved = delayed_refs_extra; + btrfs_block_rsv_migrate(rsv, trans->block_rsv, + delayed_refs_extra, 1); } + return trans; } void btrfs_evict_inode(struct inode *inode) @@ -5392,7 +5543,7 @@ void btrfs_evict_inode(struct inode *inode) rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) goto no_delete; - rsv->size = btrfs_calc_trunc_metadata_size(fs_info, 1); + rsv->size = btrfs_calc_metadata_size(fs_info, 1); rsv->failfast = 1; btrfs_i_size_write(BTRFS_I(inode), 0); @@ -5448,12 +5599,14 @@ no_delete: } /* - * this returns the key found in the dir entry in the location pointer. + * Return the key found in the dir entry in the location pointer, fill @type + * with BTRFS_FT_*, and return 0. + * * If no dir entries were found, returns -ENOENT. * If found a corrupted location in dir entry, returns -EUCLEAN. */ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, - struct btrfs_key *location) + struct btrfs_key *location, u8 *type) { const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; @@ -5482,6 +5635,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, __func__, name, btrfs_ino(BTRFS_I(dir)), location->objectid, location->type, location->offset); } + if (!ret) + *type = btrfs_dir_type(path->nodes[0], di); out: btrfs_free_path(path); return ret; @@ -5592,7 +5747,6 @@ static void inode_tree_add(struct inode *inode) static void inode_tree_del(struct inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; int empty = 0; @@ -5605,7 +5759,6 @@ static void inode_tree_del(struct inode *inode) spin_unlock(&root->inode_lock); if (empty && btrfs_root_refs(&root->root_item) == 0) { - synchronize_srcu(&fs_info->subvol_srcu); spin_lock(&root->inode_lock); empty = RB_EMPTY_ROOT(&root->inode_tree); spin_unlock(&root->inode_lock); @@ -5649,12 +5802,14 @@ static struct inode *btrfs_iget_locked(struct super_block *s, return inode; } -/* Get an inode object given its location and corresponding root. - * Returns in *is_new if the inode was read from disk +/* + * Get an inode object given its location and corresponding root. + * Path can be preallocated to prevent recursing back to iget through + * allocator. NULL is also valid but may require an additional allocation + * later. */ struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *new, - struct btrfs_path *path) + struct btrfs_root *root, struct btrfs_path *path) { struct inode *inode; @@ -5669,8 +5824,6 @@ struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, if (!ret) { inode_tree_add(inode); unlock_new_inode(inode); - if (new) - *new = 1; } else { iget_failed(inode); /* @@ -5688,9 +5841,9 @@ struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, } struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *new) + struct btrfs_root *root) { - return btrfs_iget_path(s, location, root, new, NULL); + return btrfs_iget_path(s, location, root, NULL); } static struct inode *new_simple_dir(struct super_block *s, @@ -5719,6 +5872,24 @@ static struct inode *new_simple_dir(struct super_block *s, return inode; } +static inline u8 btrfs_inode_type(struct inode *inode) +{ + /* + * Compile-time asserts that generic FT_* types still match + * BTRFS_FT_* types + */ + BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN); + BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE); + BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR); + BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV); + BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV); + BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO); + BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK); + BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK); + + return fs_umode_to_ftype(inode->i_mode); +} + struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -5726,18 +5897,31 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; struct btrfs_key location; + u8 di_type = 0; int index; int ret = 0; if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - ret = btrfs_inode_by_name(dir, dentry, &location); + ret = btrfs_inode_by_name(dir, dentry, &location, &di_type); if (ret < 0) return ERR_PTR(ret); if (location.type == BTRFS_INODE_ITEM_KEY) { - inode = btrfs_iget(dir->i_sb, &location, root, NULL); + inode = btrfs_iget(dir->i_sb, &location, root); + if (IS_ERR(inode)) + return inode; + + /* Do extra check against inode mode with di_type */ + if (btrfs_inode_type(inode) != di_type) { + btrfs_crit(fs_info, +"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", + inode->i_mode, btrfs_inode_type(inode), + di_type); + iput(inode); + return ERR_PTR(-EUCLEAN); + } return inode; } @@ -5750,7 +5934,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) else inode = new_simple_dir(dir->i_sb, &location, sub_root); } else { - inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL); + inode = btrfs_iget(dir->i_sb, &location, sub_root); } srcu_read_unlock(&fs_info->subvol_srcu, index); @@ -5797,10 +5981,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, return d_splice_alias(inode, dentry); } -unsigned char btrfs_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; - /* * All this infrastructure exists because dir_emit can fault, and we are holding * the tree lock when doing readdir. For now just allocate a buffer and copy @@ -5939,7 +6119,7 @@ again: name_ptr = (char *)(entry + 1); read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), name_len); - put_unaligned(btrfs_filetype_table[btrfs_dir_type(leaf, di)], + put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)), &entry->type); btrfs_dir_item_key_to_cpu(leaf, di, &location); put_unaligned(location.objectid, &entry->ino); @@ -6190,13 +6370,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, u32 sizes[2]; int nitems = name ? 2 : 1; unsigned long ptr; + unsigned int nofs_flag; int ret; path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); + nofs_flag = memalloc_nofs_save(); inode = new_inode(fs_info->sb); + memalloc_nofs_restore(nofs_flag); if (!inode) { btrfs_free_path(path); return ERR_PTR(-ENOMEM); @@ -6342,11 +6525,6 @@ fail: return ERR_PTR(ret); } -static inline u8 btrfs_inode_type(struct inode *inode) -{ - return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; -} - /* * utility function to add 'inode' into 'parent_inode' with * a give name and a given sequence number. @@ -6396,8 +6574,18 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + name_len * 2); inode_inc_iversion(&parent_inode->vfs_inode); - parent_inode->vfs_inode.i_mtime = parent_inode->vfs_inode.i_ctime = - current_time(&parent_inode->vfs_inode); + /* + * If we are replaying a log tree, we do not want to update the mtime + * and ctime of the parent directory with the current time, since the + * log replay procedure is responsible for setting them to their correct + * values (the ones it had when the fsync was done). + */ + if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) { + struct timespec64 now = current_time(&parent_inode->vfs_inode); + + parent_inode->vfs_inode.i_mtime = now; + parent_inode->vfs_inode.i_ctime = now; + } ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode); if (ret) btrfs_abort_transaction(trans, ret); @@ -6634,7 +6822,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (err) goto fail; } - BTRFS_I(inode)->last_link_trans = trans->transid; d_instantiate(dentry, inode); ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, true, NULL); @@ -6796,8 +6983,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); - if (em) - em->bdev = fs_info->fs_devices->latest_bdev; read_unlock(&em_tree->lock); if (em) { @@ -6813,7 +6998,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, err = -ENOMEM; goto out; } - em->bdev = fs_info->fs_devices->latest_bdev; em->start = EXTENT_MAP_HOLE; em->orig_start = EXTENT_MAP_HOLE; em->len = (u64)-1; @@ -6864,6 +7048,14 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, extent_start = found_key.offset; if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + /* Only regular file could have regular/prealloc extent */ + if (!S_ISREG(inode->vfs_inode.i_mode)) { + ret = -EUCLEAN; + btrfs_crit(fs_info, + "regular/prealloc extent found for non-regular inode %llu", + btrfs_ino(inode)); + goto out; + } extent_end = extent_start + btrfs_file_extent_num_bytes(leaf, item); @@ -7064,7 +7256,6 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, err = -ENOMEM; goto out; } - em->bdev = NULL; ASSERT(hole_em); /* @@ -7424,7 +7615,6 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, { struct extent_map_tree *em_tree; struct extent_map *em; - struct btrfs_root *root = BTRFS_I(inode)->root; int ret; ASSERT(type == BTRFS_ORDERED_PREALLOC || @@ -7442,7 +7632,6 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, em->len = len; em->block_len = block_len; em->block_start = block_start; - em->bdev = root->fs_info->fs_devices->latest_bdev; em->orig_block_len = orig_block_len; em->ram_bytes = ram_bytes; em->generation = -1; @@ -7481,6 +7670,8 @@ static int btrfs_get_blocks_direct_read(struct extent_map *em, struct inode *inode, u64 start, u64 len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + if (em->block_start == EXTENT_MAP_HOLE || test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) return -ENOENT; @@ -7490,7 +7681,7 @@ static int btrfs_get_blocks_direct_read(struct extent_map *em, bh_result->b_blocknr = (em->block_start + (start - em->start)) >> inode->i_blkbits; bh_result->b_size = len; - bh_result->b_bdev = em->bdev; + bh_result->b_bdev = fs_info->fs_devices->latest_bdev; set_buffer_mapped(bh_result); return 0; @@ -7573,7 +7764,7 @@ skip_cow: bh_result->b_blocknr = (em->block_start + (start - em->start)) >> inode->i_blkbits; bh_result->b_size = len; - bh_result->b_bdev = em->bdev; + bh_result->b_bdev = fs_info->fs_devices->latest_bdev; set_buffer_mapped(bh_result); if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) @@ -7604,12 +7795,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; u64 len = bh_result->b_size; - int unlock_bits = EXTENT_LOCKED; int ret = 0; - if (create) - unlock_bits |= EXTENT_DIRTY; - else + if (!create) len = min_t(u64, len, fs_info->sectorsize); lockstart = start; @@ -7668,9 +7856,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (ret < 0) goto unlock_err; - /* clear and unlock the entire range */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - unlock_bits, 1, 0, &cached_state); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); } else { ret = btrfs_get_blocks_direct_read(em, bh_result, inode, start, len); @@ -7686,9 +7873,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, */ lockstart = start + bh_result->b_size; if (lockstart < lockend) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, unlock_bits, 1, 0, - &cached_state); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state); } else { free_extent_state(cached_state); } @@ -7699,8 +7885,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, return 0; unlock_err: - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - unlock_bits, 1, 0, &cached_state); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); err: if (dio_data) current->journal_info = dio_data; @@ -7720,7 +7906,7 @@ static inline blk_status_t submit_dio_repair_bio(struct inode *inode, if (ret) return ret; - ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); + ret = btrfs_map_bio(fs_info, bio, mirror_num); return ret; } @@ -7828,7 +8014,6 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) struct inode *inode = done->inode; struct bio_vec *bvec; struct extent_io_tree *io_tree, *failure_tree; - int i; struct bvec_iter_all iter_all; if (bio->bi_status) @@ -7841,7 +8026,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) done->uptodate = 1; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i, iter_all) + bio_for_each_segment_all(bvec, bio, iter_all) clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree, io_tree, done->start, bvec->bv_page, btrfs_ino(BTRFS_I(inode)), 0); @@ -7919,7 +8104,7 @@ static void btrfs_retry_endio(struct bio *bio) struct bio_vec *bvec; int uptodate; int ret; - int i; + int i = 0; struct bvec_iter_all iter_all; if (bio->bi_status) @@ -7934,7 +8119,7 @@ static void btrfs_retry_endio(struct bio *bio) failure_tree = &BTRFS_I(inode)->io_failure_tree; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i, iter_all) { + bio_for_each_segment_all(bvec, bio, iter_all) { ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, bvec->bv_offset, done->start, bvec->bv_len); @@ -7946,6 +8131,7 @@ static void btrfs_retry_endio(struct bio *bio) bvec->bv_offset); else uptodate = 0; + i++; } done->uptodate = uptodate; @@ -8073,18 +8259,14 @@ static void __endio_write_update_ordered(struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered = NULL; struct btrfs_workqueue *wq; - btrfs_work_func_t func; u64 ordered_offset = offset; u64 ordered_bytes = bytes; u64 last_offset; - if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (btrfs_is_free_space_inode(BTRFS_I(inode))) wq = fs_info->endio_freespace_worker; - func = btrfs_freespace_write_helper; - } else { + else wq = fs_info->endio_write_workers; - func = btrfs_endio_write_helper; - } while (ordered_offset < offset + bytes) { last_offset = ordered_offset; @@ -8092,9 +8274,8 @@ static void __endio_write_update_ordered(struct inode *inode, &ordered_offset, ordered_bytes, uptodate)) { - btrfs_init_work(&ordered->work, func, - finish_ordered_fn, - NULL, NULL); + btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, + NULL); btrfs_queue_work(wq, &ordered->work); } /* @@ -8251,7 +8432,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, goto err; } map: - ret = btrfs_map_bio(fs_info, bio, 0, 0); + ret = btrfs_map_bio(fs_info, bio, 0); err: return ret; } @@ -8264,22 +8445,21 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) struct bio *orig_bio = dip->orig_bio; u64 start_sector = orig_bio->bi_iter.bi_sector; u64 file_offset = dip->logical_offset; - u64 map_length; int async_submit = 0; u64 submit_len; int clone_offset = 0; int clone_len; int ret; blk_status_t status; + struct btrfs_io_geometry geom; - map_length = orig_bio->bi_iter.bi_size; - submit_len = map_length; - ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9, - &map_length, NULL, 0); + submit_len = orig_bio->bi_iter.bi_size; + ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), + start_sector << 9, submit_len, &geom); if (ret) return -EIO; - if (map_length >= submit_len) { + if (geom.len >= submit_len) { bio = orig_bio; dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; goto submit; @@ -8292,10 +8472,10 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) async_submit = 1; /* bio split */ - ASSERT(map_length <= INT_MAX); + ASSERT(geom.len <= INT_MAX); atomic_inc(&dip->pending_bios); do { - clone_len = min_t(int, submit_len, map_length); + clone_len = min_t(int, submit_len, geom.len); /* * This will never fail as it's passing GPF_NOFS and @@ -8332,9 +8512,8 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) start_sector += clone_len >> 9; file_offset += clone_len; - map_length = submit_len; - ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), - start_sector << 9, &map_length, NULL, 0); + ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), + start_sector << 9, submit_len, &geom); if (ret) goto out_err; } while (submit_len > 0); @@ -8586,7 +8765,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, data_reserved, offset, count - (size_t)ret, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), count, false); + btrfs_delalloc_release_extents(BTRFS_I(inode), count); } out: if (wakeup) @@ -8717,8 +8896,7 @@ again: */ if (!inode_evicting) clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state); /* @@ -8773,8 +8951,7 @@ again: if (PageDirty(page)) btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { - clear_extent_bit(tree, page_start, page_end, - EXTENT_LOCKED | EXTENT_DIRTY | + clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, &cached_state); @@ -8902,12 +9079,11 @@ again: * reserve data&meta space before lock_page() (see above comments). */ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state); + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 0, 0, &cached_state); ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0, - &cached_state, 0); + &cached_state); if (ret2) { unlock_extent_cached(io_tree, page_start, page_end, &cached_state); @@ -8939,7 +9115,7 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state); if (!ret2) { - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; @@ -8948,7 +9124,7 @@ again: out_unlock: unlock_page(page); out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0)); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); btrfs_delalloc_release_space(inode, data_reserved, page_start, reserved_space, (ret != 0)); out_noreserve: @@ -8965,7 +9141,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) int ret; struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; - u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1); + u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), @@ -9163,7 +9339,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->index_cnt = (u64)-1; ei->dir_index = 0; ei->last_unlink_trans = 0; - ei->last_link_trans = 0; ei->last_log_commit = 0; spin_lock_init(&ei->lock); @@ -9182,13 +9357,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); - extent_io_tree_init(&ei->io_tree, inode); - extent_io_tree_init(&ei->io_failure_tree, inode); - ei->io_tree.track_uptodate = 1; - ei->io_failure_tree.track_uptodate = 1; + extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode); + extent_io_tree_init(fs_info, &ei->io_failure_tree, + IO_TREE_INODE_IO_FAILURE, inode); + ei->io_tree.track_uptodate = true; + ei->io_failure_tree.track_uptodate = true; atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); - mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); @@ -9206,9 +9381,8 @@ void btrfs_test_destroy_inode(struct inode *inode) } #endif -static void btrfs_i_callback(struct rcu_head *head) +void btrfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } @@ -9234,7 +9408,7 @@ void btrfs_destroy_inode(struct inode *inode) * created. */ if (!root) - goto free; + return; while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); @@ -9252,8 +9426,6 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); -free: - call_rcu(&inode->i_rcu, btrfs_i_callback); } int btrfs_drop_inode(struct inode *inode) @@ -9288,6 +9460,7 @@ void __cold btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_trans_handle_cachep); kmem_cache_destroy(btrfs_path_cachep); kmem_cache_destroy(btrfs_free_space_cachep); + kmem_cache_destroy(btrfs_free_space_bitmap_cachep); } int __init btrfs_init_cachep(void) @@ -9317,6 +9490,12 @@ int __init btrfs_init_cachep(void) if (!btrfs_free_space_cachep) goto fail; + btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", + PAGE_SIZE, PAGE_SIZE, + SLAB_RED_ZONE, NULL); + if (!btrfs_free_space_bitmap_cachep) + goto fail; + return 0; fail: btrfs_destroy_cachep(); @@ -9376,7 +9555,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; - u64 root_objectid; int ret; bool root_log_pinned = false; bool dest_log_pinned = false; @@ -9394,9 +9572,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, btrfs_init_log_ctx(&ctx_dest, new_inode); /* close the race window with snapshot create/destroy ioctl */ - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) - down_read(&fs_info->subvol_sem); - if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + if (old_ino == BTRFS_FIRST_FREE_OBJECTID || + new_ino == BTRFS_FIRST_FREE_OBJECTID) down_read(&fs_info->subvol_sem); /* @@ -9413,6 +9590,9 @@ static int btrfs_rename_exchange(struct inode *old_dir, goto out_notrans; } + if (dest != root) + btrfs_record_root_in_trans(trans, dest); + /* * We need to find a free sequence number both in the source and * in the destination directory for the exchange. @@ -9430,7 +9610,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* Reference for the source. */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { /* force full log commit if subvolume involved. */ - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); } else { btrfs_pin_log_trans(root); root_log_pinned = true; @@ -9447,7 +9627,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* And now for the dest. */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { /* force full log commit if subvolume involved. */ - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); } else { btrfs_pin_log_trans(dest); dest_log_pinned = true; @@ -9480,10 +9660,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { - root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; - ret = btrfs_unlink_subvol(trans, old_dir, root_objectid, - old_dentry->d_name.name, - old_dentry->d_name.len); + ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), @@ -9499,10 +9676,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { - root_objectid = BTRFS_I(new_inode)->root->root_key.objectid; - ret = btrfs_unlink_subvol(trans, new_dir, root_objectid, - new_dentry->d_name.name, - new_dentry->d_name.len); + ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), @@ -9583,7 +9757,7 @@ out_fail: btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || (new_inode && btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); if (root_log_pinned) { btrfs_end_log_trans(root); @@ -9607,6 +9781,18 @@ out_fail: commit_transaction = true; } if (commit_transaction) { + /* + * We may have set commit_transaction when logging the new name + * in the destination root, in which case we left the source + * root context in the list of log contextes. So make sure we + * remove it to avoid invalid memory accesses, since the context + * was allocated in our stack frame. + */ + if (sync_log_root) { + mutex_lock(&root->log_mutex); + list_del_init(&ctx_root.list); + mutex_unlock(&root->log_mutex); + } ret = btrfs_commit_transaction(trans); } else { int ret2; @@ -9615,11 +9801,13 @@ out_fail: ret = ret ? ret : ret2; } out_notrans: - if (new_ino == BTRFS_FIRST_FREE_OBJECTID) - up_read(&fs_info->subvol_sem); - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + if (new_ino == BTRFS_FIRST_FREE_OBJECTID || + old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); + ASSERT(list_empty(&ctx_root.list)); + ASSERT(list_empty(&ctx_dest.list)); + return ret; } @@ -9686,7 +9874,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_inode = d_inode(new_dentry); struct inode *old_inode = d_inode(old_dentry); u64 index = 0; - u64 root_objectid; int ret; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); bool log_pinned = false; @@ -9769,7 +9956,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BTRFS_I(old_inode)->dir_index = 0ULL; if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. */ - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); } else { btrfs_pin_log_trans(root); log_pinned = true; @@ -9794,10 +9981,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BTRFS_I(old_inode), 1); if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { - root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; - ret = btrfs_unlink_subvol(trans, old_dir, root_objectid, - old_dentry->d_name.name, - old_dentry->d_name.len); + ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), @@ -9816,10 +10000,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode->i_ctime = current_time(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - root_objectid = BTRFS_I(new_inode)->location.objectid; - ret = btrfs_unlink_subvol(trans, new_dir, root_objectid, - new_dentry->d_name.name, - new_dentry->d_name.len); + ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), @@ -9890,7 +10071,7 @@ out_fail: btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || (new_inode && btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); btrfs_end_log_trans(root); log_pinned = false; @@ -9964,8 +10145,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; - btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, - btrfs_run_delalloc_work, NULL, NULL); + btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); return work; } @@ -10193,7 +10373,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_aops; inode_set_bytes(inode, name_len); btrfs_i_size_write(BTRFS_I(inode), name_len); err = btrfs_update_inode(trans, root, inode); @@ -10299,7 +10478,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em->block_len = ins.offset; em->orig_block_len = ins.offset; em->ram_bytes = ins.offset; - em->bdev = fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PREALLOC, &em->flags); em->generation = trans->transid; @@ -10655,7 +10833,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, start = 0; while (start < isize) { u64 logical_block_start, physical_block_start; - struct btrfs_block_group_cache *bg; + struct btrfs_block_group *bg; u64 len = isize - start; em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cd4e693406a0..12ae31e1813e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -43,6 +43,9 @@ #include "qgroup.h" #include "tree-log.h" #include "compression.h" +#include "space-info.h" +#include "delalloc-space.h" +#include "block-group.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI @@ -189,9 +192,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) struct btrfs_trans_handle *trans; unsigned int fsflags, old_fsflags; int ret; - u64 old_flags; - unsigned int old_i_flags; - umode_t mode; + const char *comp = NULL; + u32 binode_flags = binode->flags; if (!inode_owner_or_capable(inode)) return -EPERM; @@ -212,66 +214,59 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) inode_lock(inode); - old_flags = binode->flags; - old_i_flags = inode->i_flags; - mode = inode->i_mode; - fsflags = btrfs_mask_fsflags_for_type(inode, fsflags); old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags); - if ((fsflags ^ old_fsflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - ret = -EPERM; - goto out_unlock; - } - } + ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags); + if (ret) + goto out_unlock; if (fsflags & FS_SYNC_FL) - binode->flags |= BTRFS_INODE_SYNC; + binode_flags |= BTRFS_INODE_SYNC; else - binode->flags &= ~BTRFS_INODE_SYNC; + binode_flags &= ~BTRFS_INODE_SYNC; if (fsflags & FS_IMMUTABLE_FL) - binode->flags |= BTRFS_INODE_IMMUTABLE; + binode_flags |= BTRFS_INODE_IMMUTABLE; else - binode->flags &= ~BTRFS_INODE_IMMUTABLE; + binode_flags &= ~BTRFS_INODE_IMMUTABLE; if (fsflags & FS_APPEND_FL) - binode->flags |= BTRFS_INODE_APPEND; + binode_flags |= BTRFS_INODE_APPEND; else - binode->flags &= ~BTRFS_INODE_APPEND; + binode_flags &= ~BTRFS_INODE_APPEND; if (fsflags & FS_NODUMP_FL) - binode->flags |= BTRFS_INODE_NODUMP; + binode_flags |= BTRFS_INODE_NODUMP; else - binode->flags &= ~BTRFS_INODE_NODUMP; + binode_flags &= ~BTRFS_INODE_NODUMP; if (fsflags & FS_NOATIME_FL) - binode->flags |= BTRFS_INODE_NOATIME; + binode_flags |= BTRFS_INODE_NOATIME; else - binode->flags &= ~BTRFS_INODE_NOATIME; + binode_flags &= ~BTRFS_INODE_NOATIME; if (fsflags & FS_DIRSYNC_FL) - binode->flags |= BTRFS_INODE_DIRSYNC; + binode_flags |= BTRFS_INODE_DIRSYNC; else - binode->flags &= ~BTRFS_INODE_DIRSYNC; + binode_flags &= ~BTRFS_INODE_DIRSYNC; if (fsflags & FS_NOCOW_FL) { - if (S_ISREG(mode)) { + if (S_ISREG(inode->i_mode)) { /* * It's safe to turn csums off here, no extents exist. * Otherwise we want the flag to reflect the real COW * status of the file and will not set it. */ if (inode->i_size == 0) - binode->flags |= BTRFS_INODE_NODATACOW - | BTRFS_INODE_NODATASUM; + binode_flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; } else { - binode->flags |= BTRFS_INODE_NODATACOW; + binode_flags |= BTRFS_INODE_NODATACOW; } } else { /* * Revert back under same assumptions as above */ - if (S_ISREG(mode)) { + if (S_ISREG(inode->i_mode)) { if (inode->i_size == 0) - binode->flags &= ~(BTRFS_INODE_NODATACOW - | BTRFS_INODE_NODATASUM); + binode_flags &= ~(BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM); } else { - binode->flags &= ~BTRFS_INODE_NODATACOW; + binode_flags &= ~BTRFS_INODE_NODATACOW; } } @@ -281,57 +276,59 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) * things smaller. */ if (fsflags & FS_NOCOMP_FL) { - binode->flags &= ~BTRFS_INODE_COMPRESS; - binode->flags |= BTRFS_INODE_NOCOMPRESS; - - ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); - if (ret && ret != -ENODATA) - goto out_drop; + binode_flags &= ~BTRFS_INODE_COMPRESS; + binode_flags |= BTRFS_INODE_NOCOMPRESS; } else if (fsflags & FS_COMPR_FL) { - const char *comp; if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; goto out_unlock; } - binode->flags |= BTRFS_INODE_COMPRESS; - binode->flags &= ~BTRFS_INODE_NOCOMPRESS; + binode_flags |= BTRFS_INODE_COMPRESS; + binode_flags &= ~BTRFS_INODE_NOCOMPRESS; comp = btrfs_compress_type2str(fs_info->compress_type); if (!comp || comp[0] == 0) comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB); - - ret = btrfs_set_prop(inode, "btrfs.compression", - comp, strlen(comp), 0); - if (ret) - goto out_drop; - } else { - ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); - if (ret && ret != -ENODATA) - goto out_drop; - binode->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); + binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } - trans = btrfs_start_transaction(root, 1); + /* + * 1 for inode item + * 2 for properties + */ + trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto out_drop; + goto out_unlock; + } + + if (comp) { + ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, + strlen(comp), 0); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } + } else { + ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, + 0, 0); + if (ret && ret != -ENODATA) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } } + binode->flags = binode_flags; btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); inode->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, inode); + out_end_trans: btrfs_end_transaction(trans); - out_drop: - if (ret) { - binode->flags = old_flags; - inode->i_flags = old_i_flags; - } - out_unlock: inode_unlock(inode); mnt_drop_write_file(file); @@ -379,9 +376,7 @@ static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg) struct btrfs_inode *binode = BTRFS_I(file_inode(file)); struct fsxattr fa; - memset(&fa, 0, sizeof(fa)); - fa.fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags); - + simple_fill_fsxattr(&fa, btrfs_inode_flags_to_xflags(binode->flags)); if (copy_to_user(arg, &fa, sizeof(fa))) return -EFAULT; @@ -394,7 +389,7 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg) struct btrfs_inode *binode = BTRFS_I(inode); struct btrfs_root *root = binode->root; struct btrfs_trans_handle *trans; - struct fsxattr fa; + struct fsxattr fa, old_fa; unsigned old_flags; unsigned old_i_flags; int ret = 0; @@ -405,7 +400,6 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg) if (btrfs_root_readonly(root)) return -EROFS; - memset(&fa, 0, sizeof(fa)); if (copy_from_user(&fa, arg, sizeof(fa))) return -EFAULT; @@ -425,13 +419,11 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg) old_flags = binode->flags; old_i_flags = inode->i_flags; - /* We need the capabilities to change append-only or immutable inode */ - if (((old_flags & (BTRFS_INODE_APPEND | BTRFS_INODE_IMMUTABLE)) || - (fa.fsx_xflags & (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE))) && - !capable(CAP_LINUX_IMMUTABLE)) { - ret = -EPERM; + simple_fill_fsxattr(&old_fa, + btrfs_inode_flags_to_xflags(binode->flags)); + ret = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa); + if (ret) goto out_unlock; - } if (fa.fsx_xflags & FS_XFLAG_SYNC) binode->flags |= BTRFS_INODE_SYNC; @@ -487,10 +479,9 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg) return put_user(inode->i_generation, arg); } -static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) +static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, + void __user *arg) { - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_device *device; struct request_queue *q; struct fstrim_range range; @@ -549,7 +540,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) return 0; } -int btrfs_is_empty_uuid(u8 *uuid) +int __pure btrfs_is_empty_uuid(u8 *uuid) { int i; @@ -713,11 +704,17 @@ static noinline int create_subvol(struct inode *dir, btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); ret = btrfs_update_inode(trans, root, dir); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, btrfs_ino(BTRFS_I(dir)), index, name, namelen); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); @@ -1341,9 +1338,8 @@ again: lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state); clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, - page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, - &cached_state); + page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 0, 0, &cached_state); if (i_done != page_cnt) { spin_lock(&BTRFS_I(inode)->lock); @@ -1369,8 +1365,7 @@ again: unlock_page(pages[i]); put_page(pages[i]); } - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT, - false); + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return i_done; out: @@ -1381,8 +1376,7 @@ out: btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT, - true); + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return ret; @@ -1420,7 +1414,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, return -EINVAL; if (do_compress) { - if (range->compress_type > BTRFS_COMPRESS_TYPES) + if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) return -EINVAL; if (range->compress_type) compress_type = range->compress_type; @@ -1849,8 +1843,15 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, goto free_args; } - if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) + if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) { + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + + btrfs_warn(fs_info, +"SNAP_CREATE_V2 ioctl with CREATE_ASYNC is deprecated and will be removed in kernel 5.7"); + ptr = &transid; + } if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { @@ -2466,7 +2467,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, goto out; } - temp_inode = btrfs_iget(sb, &key2, root, NULL); + temp_inode = btrfs_iget(sb, &key2, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); goto out; @@ -2931,8 +2932,10 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, inode_lock(inode); err = btrfs_delete_subvolume(dir, dentry); inode_unlock(inode); - if (!err) + if (!err) { + fsnotify_rmdir(dir, dentry); d_delete(dentry); + } out_dput: dput(dentry); @@ -3260,6 +3263,19 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, { int ret; u64 i, tail_len, chunk_count; + struct btrfs_root *root_dst = BTRFS_I(dst)->root; + + spin_lock(&root_dst->root_item_lock); + if (root_dst->send_in_progress) { + btrfs_warn_rl(root_dst->fs_info, +"cannot deduplicate to root %llu while send operations are using it (%d in progress)", + root_dst->root_key.objectid, + root_dst->send_in_progress); + spin_unlock(&root_dst->root_item_lock); + return -EAGAIN; + } + root_dst->dedupe_in_progress++; + spin_unlock(&root_dst->root_item_lock); tail_len = olen % BTRFS_MAX_DEDUPE_LEN; chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); @@ -3268,7 +3284,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, dst, dst_loff); if (ret) - return ret; + goto out; loff += BTRFS_MAX_DEDUPE_LEN; dst_loff += BTRFS_MAX_DEDUPE_LEN; @@ -3277,6 +3293,10 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, if (tail_len > 0) ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff); +out: + spin_lock(&root_dst->root_item_lock); + root_dst->dedupe_in_progress--; + spin_unlock(&root_dst->root_item_lock); return ret; } @@ -3314,61 +3334,6 @@ out: return ret; } -static void clone_update_extent_map(struct btrfs_inode *inode, - const struct btrfs_trans_handle *trans, - const struct btrfs_path *path, - const u64 hole_offset, - const u64 hole_len) -{ - struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_map *em; - int ret; - - em = alloc_extent_map(); - if (!em) { - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); - return; - } - - if (path) { - struct btrfs_file_extent_item *fi; - - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - btrfs_extent_item_to_extent_map(inode, path, fi, false, em); - em->generation = -1; - if (btrfs_file_extent_type(path->nodes[0], fi) == - BTRFS_FILE_EXTENT_INLINE) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &inode->runtime_flags); - } else { - em->start = hole_offset; - em->len = hole_len; - em->ram_bytes = em->len; - em->orig_start = hole_offset; - em->block_start = EXTENT_MAP_HOLE; - em->block_len = 0; - em->orig_block_len = 0; - em->compress_type = BTRFS_COMPRESS_NONE; - em->generation = trans->transid; - } - - while (1) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 1); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(inode, em->start, - em->start + em->len - 1, 0); - } - - if (ret) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); -} - /* * Make sure we do not end up inserting an inline extent into a file that has * already other (non-inline) extents. If a file has an inline extent it can @@ -3509,6 +3474,7 @@ copy_inline_extent: path->slots[0]), size); inode_add_bytes(dst, datal); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); return 0; } @@ -3560,6 +3526,14 @@ static int btrfs_clone(struct inode *src, struct inode *inode, while (1) { u64 next_key_min_offset = key.offset + 1; + struct btrfs_file_extent_item *extent; + int type; + u32 size; + struct btrfs_key new_key; + u64 disko = 0, diskl = 0; + u64 datao = 0, datal = 0; + u8 comp; + u64 drop_start; /* * note the key will change type as we walk through the @@ -3600,75 +3574,115 @@ process_slot: key.objectid != btrfs_ino(BTRFS_I(src))) break; - if (key.type == BTRFS_EXTENT_DATA_KEY) { - struct btrfs_file_extent_item *extent; - int type; - u32 size; - struct btrfs_key new_key; - u64 disko = 0, diskl = 0; - u64 datao = 0, datal = 0; - u8 comp; - u64 drop_start; - - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - comp = btrfs_file_extent_compression(leaf, extent); - type = btrfs_file_extent_type(leaf, extent); - if (type == BTRFS_FILE_EXTENT_REG || - type == BTRFS_FILE_EXTENT_PREALLOC) { - disko = btrfs_file_extent_disk_bytenr(leaf, - extent); - diskl = btrfs_file_extent_disk_num_bytes(leaf, - extent); - datao = btrfs_file_extent_offset(leaf, extent); - datal = btrfs_file_extent_num_bytes(leaf, - extent); - } else if (type == BTRFS_FILE_EXTENT_INLINE) { - /* take upper bound, may be compressed */ - datal = btrfs_file_extent_ram_bytes(leaf, - extent); - } + ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + comp = btrfs_file_extent_compression(leaf, extent); + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + disko = btrfs_file_extent_disk_bytenr(leaf, extent); + diskl = btrfs_file_extent_disk_num_bytes(leaf, extent); + datao = btrfs_file_extent_offset(leaf, extent); + datal = btrfs_file_extent_num_bytes(leaf, extent); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* Take upper bound, may be compressed */ + datal = btrfs_file_extent_ram_bytes(leaf, extent); + } + + /* + * The first search might have left us at an extent item that + * ends before our target range's start, can happen if we have + * holes and NO_HOLES feature enabled. + */ + if (key.offset + datal <= off) { + path->slots[0]++; + goto process_slot; + } else if (key.offset >= off + len) { + break; + } + next_key_min_offset = key.offset + datal; + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), + size); + + btrfs_release_path(path); + path->leave_spinning = 0; + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.objectid = btrfs_ino(BTRFS_I(inode)); + if (off <= key.offset) + new_key.offset = key.offset + destoff - off; + else + new_key.offset = destoff; + + /* + * Deal with a hole that doesn't have an extent item that + * represents it (NO_HOLES feature enabled). + * This hole is either in the middle of the cloning range or at + * the beginning (fully overlaps it or partially overlaps it). + */ + if (new_key.offset != last_dest_end) + drop_start = last_dest_end; + else + drop_start = new_key.offset; + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + struct btrfs_clone_extent_info clone_info; /* - * The first search might have left us at an extent - * item that ends before our target range's start, can - * happen if we have holes and NO_HOLES feature enabled. + * a | --- range to clone ---| b + * | ------------- extent ------------- | */ - if (key.offset + datal <= off) { - path->slots[0]++; - goto process_slot; - } else if (key.offset >= off + len) { - break; + + /* Subtract range b */ + if (key.offset + datal > off + len) + datal = off + len - key.offset; + + /* Subtract range a */ + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; } - next_key_min_offset = key.offset + datal; - size = btrfs_item_size_nr(leaf, slot); - read_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - btrfs_release_path(path); - path->leave_spinning = 0; + clone_info.disk_offset = disko; + clone_info.disk_len = diskl; + clone_info.data_offset = datao; + clone_info.data_len = datal; + clone_info.file_offset = new_key.offset; + clone_info.extent_buf = buf; + clone_info.item_size = size; + ret = btrfs_punch_hole_range(inode, path, + drop_start, + new_key.offset + datal - 1, + &clone_info, &trans); + if (ret) + goto out; + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 skip = 0; + u64 trim = 0; - memcpy(&new_key, &key, sizeof(new_key)); - new_key.objectid = btrfs_ino(BTRFS_I(inode)); - if (off <= key.offset) - new_key.offset = key.offset + destoff - off; - else - new_key.offset = destoff; + if (off > key.offset) { + skip = off - key.offset; + new_key.offset += skip; + } - /* - * Deal with a hole that doesn't have an extent item - * that represents it (NO_HOLES feature enabled). - * This hole is either in the middle of the cloning - * range or at the beginning (fully overlaps it or - * partially overlaps it). - */ - if (new_key.offset != last_dest_end) - drop_start = last_dest_end; - else - drop_start = new_key.offset; + if (key.offset + datal > off + len) + trim = key.offset + datal - (off + len); + + if (comp && (skip || trim)) { + ret = -EINVAL; + goto out; + } + size -= skip + trim; + datal -= skip + trim; /* + * If our extent is inline, we know we will drop or + * adjust at most 1 extent item in the destination root. + * * 1 - adjusting old extent (we may have to split it) * 1 - add new extent * 1 - inode update @@ -3679,137 +3693,28 @@ process_slot: goto out; } - if (type == BTRFS_FILE_EXTENT_REG || - type == BTRFS_FILE_EXTENT_PREALLOC) { - /* - * a | --- range to clone ---| b - * | ------------- extent ------------- | - */ - - /* subtract range b */ - if (key.offset + datal > off + len) - datal = off + len - key.offset; - - /* subtract range a */ - if (off > key.offset) { - datao += off - key.offset; - datal -= off - key.offset; - } - - ret = btrfs_drop_extents(trans, root, inode, - drop_start, - new_key.offset + datal, - 1); - if (ret) { - if (ret != -EOPNOTSUPP) - btrfs_abort_transaction(trans, - ret); - btrfs_end_transaction(trans); - goto out; - } - - ret = btrfs_insert_empty_item(trans, root, path, - &new_key, size); - if (ret) { + ret = clone_copy_inline_extent(inode, trans, path, + &new_key, drop_start, + datal, skip, size, buf); + if (ret) { + if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - goto out; - } - - leaf = path->nodes[0]; - slot = path->slots[0]; - write_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - - /* disko == 0 means it's a hole */ - if (!disko) - datao = 0; - - btrfs_set_file_extent_offset(leaf, extent, - datao); - btrfs_set_file_extent_num_bytes(leaf, extent, - datal); - - if (disko) { - inode_add_bytes(inode, datal); - ret = btrfs_inc_extent_ref(trans, - root, - disko, diskl, 0, - root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), - new_key.offset - datao); - if (ret) { - btrfs_abort_transaction(trans, - ret); - btrfs_end_transaction(trans); - goto out; - - } - } - } else if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 skip = 0; - u64 trim = 0; - - if (off > key.offset) { - skip = off - key.offset; - new_key.offset += skip; - } - - if (key.offset + datal > off + len) - trim = key.offset + datal - (off + len); - - if (comp && (skip || trim)) { - ret = -EINVAL; - btrfs_end_transaction(trans); - goto out; - } - size -= skip + trim; - datal -= skip + trim; - - ret = clone_copy_inline_extent(inode, - trans, path, - &new_key, - drop_start, - datal, - skip, size, buf); - if (ret) { - if (ret != -EOPNOTSUPP) - btrfs_abort_transaction(trans, - ret); - btrfs_end_transaction(trans); - goto out; - } - leaf = path->nodes[0]; - slot = path->slots[0]; + btrfs_end_transaction(trans); + goto out; } + } - /* If we have an implicit hole (NO_HOLES feature). */ - if (drop_start < new_key.offset) - clone_update_extent_map(BTRFS_I(inode), trans, - NULL, drop_start, - new_key.offset - drop_start); - - clone_update_extent_map(BTRFS_I(inode), trans, - path, 0, 0); + btrfs_release_path(path); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); + last_dest_end = ALIGN(new_key.offset + datal, + fs_info->sectorsize); + ret = clone_finish_inode_update(trans, inode, last_dest_end, + destoff, olen, no_time_update); + if (ret) + goto out; + if (new_key.offset + datal >= destoff + len) + break; - last_dest_end = ALIGN(new_key.offset + datal, - fs_info->sectorsize); - ret = clone_finish_inode_update(trans, inode, - last_dest_end, - destoff, olen, - no_time_update); - if (ret) - goto out; - if (new_key.offset + datal >= destoff + len) - break; - } btrfs_release_path(path); key.offset = next_key_min_offset; @@ -3822,31 +3727,20 @@ process_slot: if (last_dest_end < destoff + len) { /* - * We have an implicit hole (NO_HOLES feature is enabled) that - * fully or partially overlaps our cloning range at its end. + * We have an implicit hole that fully or partially overlaps our + * cloning range at its end. This means that we either have the + * NO_HOLES feature enabled or the implicit hole happened due to + * mixing buffered and direct IO writes against this file. */ btrfs_release_path(path); + path->leave_spinning = 0; - /* - * 1 - remove extent(s) - * 1 - inode update - */ - trans = btrfs_start_transaction(root, 2); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } - ret = btrfs_drop_extents(trans, root, inode, - last_dest_end, destoff + len, 1); - if (ret) { - if (ret != -EOPNOTSUPP) - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); + ret = btrfs_punch_hole_range(inode, path, + last_dest_end, destoff + len - 1, + NULL, &trans); + if (ret) goto out; - } - clone_update_extent_map(BTRFS_I(inode), trans, NULL, - last_dest_end, - destoff + len - last_dest_end); + ret = clone_finish_inode_update(trans, inode, destoff + len, destoff, olen, no_time_update); } @@ -3948,16 +3842,10 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, return -EXDEV; } - if (same_inode) - inode_lock(inode_in); - else - lock_two_nondirectories(inode_in, inode_out); - /* don't make the dst file partly checksummed */ if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { - ret = -EINVAL; - goto out_unlock; + return -EINVAL; } /* @@ -3988,29 +3876,38 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (!same_inode) inode_dio_wait(inode_out); + /* + * Workaround to make sure NOCOW buffered write reach disk as NOCOW. + * + * Btrfs' back references do not have a block level granularity, they + * work at the whole extent level. + * NOCOW buffered write without data space reserved may not be able + * to fall back to CoW due to lack of data space, thus could cause + * data loss. + * + * Here we take a shortcut by flushing the whole inode, so that all + * nocow write should reach disk as nocow before we increase the + * reference of the extent. We could do better by only flushing NOCOW + * data, but that needs extra accounting. + * + * Also we don't need to check ASYNC_EXTENT, as async extent will be + * CoWed anyway, not affecting nocow part. + */ + ret = filemap_flush(inode_in->i_mapping); + if (ret < 0) + return ret; + ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len); if (ret < 0) - goto out_unlock; + return ret; ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), wb_len); if (ret < 0) - goto out_unlock; + return ret; - ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, + return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, remap_flags); - if (ret < 0 || *len == 0) - goto out_unlock; - - return 0; - - out_unlock: - if (same_inode) - inode_unlock(inode_in); - else - unlock_two_nondirectories(inode_in, inode_out); - - return ret; } loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, @@ -4025,16 +3922,22 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; + if (same_inode) + inode_lock(src_inode); + else + lock_two_nondirectories(src_inode, dst_inode); + ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, &len, remap_flags); if (ret < 0 || len == 0) - return ret; + goto out_unlock; if (remap_flags & REMAP_FILE_DEDUP) ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); else ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); +out_unlock: if (same_inode) inode_unlock(src_inode); else @@ -4128,16 +4031,15 @@ out: static void get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; space->total_bytes = 0; space->used_bytes = 0; space->flags = 0; list_for_each_entry(block_group, groups_list, list) { space->flags = block_group->flags; - space->total_bytes += block_group->key.offset; - space->used_bytes += - btrfs_block_group_used(&block_group->item); + space->total_bytes += block_group->length; + space->used_bytes += block_group->used; } } @@ -4350,7 +4252,19 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 0); - if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa))) + /* + * Copy scrub args to user space even if btrfs_scrub_dev() returned an + * error. This is important as it allows user space to know how much + * progress scrub has done. For example, if scrub is canceled we get + * -ECANCELED from btrfs_scrub_dev() and return that error back to user + * space. Later user space can inspect the progress from the structure + * btrfs_ioctl_scrub_args and resume scrub from where it left off + * previously (btrfs-progs does this). + * If we fail to copy the btrfs_ioctl_scrub_args structure to user space + * then return -EFAULT to signal the structure was not copied or it may + * be corrupt and unreliable due to a partial copy. + */ + if (copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; if (!(sa->flags & BTRFS_SCRUB_READONLY)) @@ -5048,10 +4962,9 @@ drop_write: return ret; } -static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) +static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, + void __user *arg) { - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_quota_rescan_args *qsa; int ret = 0; @@ -5074,11 +4987,9 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) return ret; } -static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) +static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, + void __user *arg) { - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -5250,10 +5161,9 @@ out: return ret; } -static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) +static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info, + void __user *arg) { - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); size_t len; int ret; char label[BTRFS_LABEL_SIZE]; @@ -5337,10 +5247,9 @@ int btrfs_ioctl_get_supported_features(void __user *arg) return 0; } -static int btrfs_ioctl_get_features(struct file *file, void __user *arg) +static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info, + void __user *arg) { - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_ioctl_feature_flags features; @@ -5359,7 +5268,7 @@ static int check_feature_bits(struct btrfs_fs_info *fs_info, u64 change_mask, u64 flags, u64 supported_flags, u64 safe_set, u64 safe_clear) { - const char *type = btrfs_feature_set_names[set]; + const char *type = btrfs_feature_set_name(set); char *names; u64 disallowed, unsupported; u64 set_mask = flags & change_mask; @@ -5540,8 +5449,12 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_setflags(file, argp); case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(file, argp); + case FS_IOC_GETFSLABEL: + return btrfs_ioctl_get_fslabel(fs_info, argp); + case FS_IOC_SETFSLABEL: + return btrfs_ioctl_set_fslabel(file, argp); case FITRIM: - return btrfs_ioctl_fitrim(file, argp); + return btrfs_ioctl_fitrim(fs_info, argp); case BTRFS_IOC_SNAP_CREATE: return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SNAP_CREATE_V2: @@ -5646,19 +5559,15 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_QUOTA_RESCAN: return btrfs_ioctl_quota_rescan(file, argp); case BTRFS_IOC_QUOTA_RESCAN_STATUS: - return btrfs_ioctl_quota_rescan_status(file, argp); + return btrfs_ioctl_quota_rescan_status(fs_info, argp); case BTRFS_IOC_QUOTA_RESCAN_WAIT: - return btrfs_ioctl_quota_rescan_wait(file, argp); + return btrfs_ioctl_quota_rescan_wait(fs_info, argp); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(fs_info, argp); - case BTRFS_IOC_GET_FSLABEL: - return btrfs_ioctl_get_fslabel(file, argp); - case BTRFS_IOC_SET_FSLABEL: - return btrfs_ioctl_set_fslabel(file, argp); case BTRFS_IOC_GET_SUPPORTED_FEATURES: return btrfs_ioctl_get_supported_features(argp); case BTRFS_IOC_GET_FEATURES: - return btrfs_ioctl_get_features(file, argp); + return btrfs_ioctl_get_features(fs_info, argp); case BTRFS_IOC_SET_FEATURES: return btrfs_ioctl_set_features(file, argp); case FS_IOC_FSGETXATTR: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 82b84e4daad1..571c4826c428 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -8,14 +8,194 @@ #include <linux/spinlock.h> #include <linux/page-flags.h> #include <asm/bug.h> +#include "misc.h" #include "ctree.h" #include "extent_io.h" #include "locking.h" -static void btrfs_assert_tree_read_locked(struct extent_buffer *eb); +/* + * Extent buffer locking + * ===================== + * + * The locks use a custom scheme that allows to do more operations than are + * available fromt current locking primitives. The building blocks are still + * rwlock and wait queues. + * + * Required semantics: + * + * - reader/writer exclusion + * - writer/writer exclusion + * - reader/reader sharing + * - spinning lock semantics + * - blocking lock semantics + * - try-lock semantics for readers and writers + * - one level nesting, allowing read lock to be taken by the same thread that + * already has write lock + * + * The extent buffer locks (also called tree locks) manage access to eb data + * related to the storage in the b-tree (keys, items, but not the individual + * members of eb). + * We want concurrency of many readers and safe updates. The underlying locking + * is done by read-write spinlock and the blocking part is implemented using + * counters and wait queues. + * + * spinning semantics - the low-level rwlock is held so all other threads that + * want to take it are spinning on it. + * + * blocking semantics - the low-level rwlock is not held but the counter + * denotes how many times the blocking lock was held; + * sleeping is possible + * + * Write lock always allows only one thread to access the data. + * + * + * Debugging + * --------- + * + * There are additional state counters that are asserted in various contexts, + * removed from non-debug build to reduce extent_buffer size and for + * performance reasons. + * + * + * Lock nesting + * ------------ + * + * A write operation on a tree might indirectly start a look up on the same + * tree. This can happen when btrfs_cow_block locks the tree and needs to + * lookup free extents. + * + * btrfs_cow_block + * .. + * alloc_tree_block_no_bg_flush + * btrfs_alloc_tree_block + * btrfs_reserve_extent + * .. + * load_free_space_cache + * .. + * btrfs_lookup_file_extent + * btrfs_search_slot + * + * + * Locking pattern - spinning + * -------------------------- + * + * The simple locking scenario, the +--+ denotes the spinning section. + * + * +- btrfs_tree_lock + * | - extent_buffer::rwlock is held + * | - no heavy operations should happen, eg. IO, memory allocations, large + * | structure traversals + * +- btrfs_tree_unock +* +* + * Locking pattern - blocking + * -------------------------- + * + * The blocking write uses the following scheme. The +--+ denotes the spinning + * section. + * + * +- btrfs_tree_lock + * | + * +- btrfs_set_lock_blocking_write + * + * - allowed: IO, memory allocations, etc. + * + * -- btrfs_tree_unlock - note, no explicit unblocking necessary + * + * + * Blocking read is similar. + * + * +- btrfs_tree_read_lock + * | + * +- btrfs_set_lock_blocking_read + * + * - heavy operations allowed + * + * +- btrfs_tree_read_unlock_blocking + * | + * +- btrfs_tree_read_unlock + * + */ + +#ifdef CONFIG_BTRFS_DEBUG +static inline void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) +{ + WARN_ON(eb->spinning_writers); + eb->spinning_writers++; +} + +static inline void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) +{ + WARN_ON(eb->spinning_writers != 1); + eb->spinning_writers--; +} + +static inline void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) +{ + WARN_ON(eb->spinning_writers); +} + +static inline void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) +{ + atomic_inc(&eb->spinning_readers); +} + +static inline void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) +{ + WARN_ON(atomic_read(&eb->spinning_readers) == 0); + atomic_dec(&eb->spinning_readers); +} + +static inline void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) +{ + atomic_inc(&eb->read_locks); +} + +static inline void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) +{ + atomic_dec(&eb->read_locks); +} + +static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +{ + BUG_ON(!atomic_read(&eb->read_locks)); +} + +static inline void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) +{ + eb->write_locks++; +} + +static inline void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) +{ + eb->write_locks--; +} +#else +static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { } +static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { } +static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { } +static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) { } +static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { } +static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { } +static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { } +static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { } +static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { } +static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { } +#endif + +/* + * Mark already held read lock as blocking. Can be nested in write lock by the + * same thread. + * + * Use when there are potentially long operations ahead so other thread waiting + * on the lock will not actively spin but sleep instead. + * + * The rwlock is released and blocking reader counter is increased. + */ void btrfs_set_lock_blocking_read(struct extent_buffer *eb) { + trace_btrfs_set_lock_blocking_read(eb); /* * No lock is required. The lock owner may change if we have a read * lock, but it won't change to or away from us. If we have the write @@ -25,13 +205,21 @@ void btrfs_set_lock_blocking_read(struct extent_buffer *eb) return; btrfs_assert_tree_read_locked(eb); atomic_inc(&eb->blocking_readers); - WARN_ON(atomic_read(&eb->spinning_readers) == 0); - atomic_dec(&eb->spinning_readers); + btrfs_assert_spinning_readers_put(eb); read_unlock(&eb->lock); } +/* + * Mark already held write lock as blocking. + * + * Use when there are potentially long operations ahead so other threads + * waiting on the lock will not actively spin but sleep instead. + * + * The rwlock is released and blocking writers is set. + */ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) { + trace_btrfs_set_lock_blocking_write(eb); /* * No lock is required. The lock owner may change if we have a read * lock, but it won't change to or away from us. If we have the write @@ -39,153 +227,138 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) */ if (eb->lock_nested && current->pid == eb->lock_owner) return; - if (atomic_read(&eb->blocking_writers) == 0) { - WARN_ON(atomic_read(&eb->spinning_writers) != 1); - atomic_dec(&eb->spinning_writers); + if (eb->blocking_writers == 0) { + btrfs_assert_spinning_writers_put(eb); btrfs_assert_tree_locked(eb); - atomic_inc(&eb->blocking_writers); + WRITE_ONCE(eb->blocking_writers, 1); write_unlock(&eb->lock); } } -void btrfs_clear_lock_blocking_read(struct extent_buffer *eb) -{ - /* - * No lock is required. The lock owner may change if we have a read - * lock, but it won't change to or away from us. If we have the write - * lock, we are the owner and it'll never change. - */ - if (eb->lock_nested && current->pid == eb->lock_owner) - return; - BUG_ON(atomic_read(&eb->blocking_readers) == 0); - read_lock(&eb->lock); - atomic_inc(&eb->spinning_readers); - /* atomic_dec_and_test implies a barrier */ - if (atomic_dec_and_test(&eb->blocking_readers)) - cond_wake_up_nomb(&eb->read_lock_wq); -} - -void btrfs_clear_lock_blocking_write(struct extent_buffer *eb) -{ - /* - * no lock is required. The lock owner may change if - * we have a read lock, but it won't change to or away - * from us. If we have the write lock, we are the owner - * and it'll never change. - */ - if (eb->lock_nested && current->pid == eb->lock_owner) - return; - BUG_ON(atomic_read(&eb->blocking_writers) != 1); - write_lock(&eb->lock); - WARN_ON(atomic_read(&eb->spinning_writers)); - atomic_inc(&eb->spinning_writers); - /* atomic_dec_and_test implies a barrier */ - if (atomic_dec_and_test(&eb->blocking_writers)) - cond_wake_up_nomb(&eb->write_lock_wq); -} - /* - * take a spinning read lock. This will wait for any blocking - * writers + * Lock the extent buffer for read. Wait for any writers (spinning or blocking). + * Can be nested in write lock by the same thread. + * + * Use when the locked section does only lightweight actions and busy waiting + * would be cheaper than making other threads do the wait/wake loop. + * + * The rwlock is held upon exit. */ void btrfs_tree_read_lock(struct extent_buffer *eb) { -again: - BUG_ON(!atomic_read(&eb->blocking_writers) && - current->pid == eb->lock_owner); + u64 start_ns = 0; + if (trace_btrfs_tree_read_lock_enabled()) + start_ns = ktime_get_ns(); +again: read_lock(&eb->lock); - if (atomic_read(&eb->blocking_writers) && - current->pid == eb->lock_owner) { - /* - * This extent is already write-locked by our thread. We allow - * an additional read lock to be added because it's for the same - * thread. btrfs_find_all_roots() depends on this as it may be - * called on a partly (write-)locked tree. - */ - BUG_ON(eb->lock_nested); - eb->lock_nested = 1; - read_unlock(&eb->lock); - return; - } - if (atomic_read(&eb->blocking_writers)) { + BUG_ON(eb->blocking_writers == 0 && + current->pid == eb->lock_owner); + if (eb->blocking_writers) { + if (current->pid == eb->lock_owner) { + /* + * This extent is already write-locked by our thread. + * We allow an additional read lock to be added because + * it's for the same thread. btrfs_find_all_roots() + * depends on this as it may be called on a partly + * (write-)locked tree. + */ + BUG_ON(eb->lock_nested); + eb->lock_nested = true; + read_unlock(&eb->lock); + trace_btrfs_tree_read_lock(eb, start_ns); + return; + } read_unlock(&eb->lock); wait_event(eb->write_lock_wq, - atomic_read(&eb->blocking_writers) == 0); + READ_ONCE(eb->blocking_writers) == 0); goto again; } - atomic_inc(&eb->read_locks); - atomic_inc(&eb->spinning_readers); + btrfs_assert_tree_read_locks_get(eb); + btrfs_assert_spinning_readers_get(eb); + trace_btrfs_tree_read_lock(eb, start_ns); } /* - * take a spinning read lock. - * returns 1 if we get the read lock and 0 if we don't - * this won't wait for blocking writers + * Lock extent buffer for read, optimistically expecting that there are no + * contending blocking writers. If there are, don't wait. + * + * Return 1 if the rwlock has been taken, 0 otherwise */ int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) { - if (atomic_read(&eb->blocking_writers)) + if (READ_ONCE(eb->blocking_writers)) return 0; read_lock(&eb->lock); - if (atomic_read(&eb->blocking_writers)) { + /* Refetch value after lock */ + if (READ_ONCE(eb->blocking_writers)) { read_unlock(&eb->lock); return 0; } - atomic_inc(&eb->read_locks); - atomic_inc(&eb->spinning_readers); + btrfs_assert_tree_read_locks_get(eb); + btrfs_assert_spinning_readers_get(eb); + trace_btrfs_tree_read_lock_atomic(eb); return 1; } /* - * returns 1 if we get the read lock and 0 if we don't - * this won't wait for blocking writers + * Try-lock for read. Don't block or wait for contending writers. + * + * Retrun 1 if the rwlock has been taken, 0 otherwise */ int btrfs_try_tree_read_lock(struct extent_buffer *eb) { - if (atomic_read(&eb->blocking_writers)) + if (READ_ONCE(eb->blocking_writers)) return 0; if (!read_trylock(&eb->lock)) return 0; - if (atomic_read(&eb->blocking_writers)) { + /* Refetch value after lock */ + if (READ_ONCE(eb->blocking_writers)) { read_unlock(&eb->lock); return 0; } - atomic_inc(&eb->read_locks); - atomic_inc(&eb->spinning_readers); + btrfs_assert_tree_read_locks_get(eb); + btrfs_assert_spinning_readers_get(eb); + trace_btrfs_try_tree_read_lock(eb); return 1; } /* - * returns 1 if we get the read lock and 0 if we don't - * this won't wait for blocking writers or readers + * Try-lock for write. May block until the lock is uncontended, but does not + * wait until it is free. + * + * Retrun 1 if the rwlock has been taken, 0 otherwise */ int btrfs_try_tree_write_lock(struct extent_buffer *eb) { - if (atomic_read(&eb->blocking_writers) || - atomic_read(&eb->blocking_readers)) + if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) return 0; write_lock(&eb->lock); - if (atomic_read(&eb->blocking_writers) || - atomic_read(&eb->blocking_readers)) { + /* Refetch value after lock */ + if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) { write_unlock(&eb->lock); return 0; } - atomic_inc(&eb->write_locks); - atomic_inc(&eb->spinning_writers); + btrfs_assert_tree_write_locks_get(eb); + btrfs_assert_spinning_writers_get(eb); eb->lock_owner = current->pid; + trace_btrfs_try_tree_write_lock(eb); return 1; } /* - * drop a spinning read lock + * Release read lock. Must be used only if the lock is in spinning mode. If + * the read lock is nested, must pair with read lock before the write unlock. + * + * The rwlock is not held upon exit. */ void btrfs_tree_read_unlock(struct extent_buffer *eb) { + trace_btrfs_tree_read_unlock(eb); /* * if we're nested, we have the write lock. No new locking * is needed as long as we are the lock owner. @@ -193,21 +366,25 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) * field only matters to the lock owner. */ if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = 0; + eb->lock_nested = false; return; } btrfs_assert_tree_read_locked(eb); - WARN_ON(atomic_read(&eb->spinning_readers) == 0); - atomic_dec(&eb->spinning_readers); - atomic_dec(&eb->read_locks); + btrfs_assert_spinning_readers_put(eb); + btrfs_assert_tree_read_locks_put(eb); read_unlock(&eb->lock); } /* - * drop a blocking read lock + * Release read lock, previously set to blocking by a pairing call to + * btrfs_set_lock_blocking_read(). Can be nested in write lock by the same + * thread. + * + * State of rwlock is unchanged, last reader wakes waiting threads. */ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) { + trace_btrfs_tree_read_unlock_blocking(eb); /* * if we're nested, we have the write lock. No new locking * is needed as long as we are the lock owner. @@ -215,7 +392,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) * field only matters to the lock owner. */ if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = 0; + eb->lock_nested = false; return; } btrfs_assert_tree_read_locked(eb); @@ -223,63 +400,126 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) /* atomic_dec_and_test implies a barrier */ if (atomic_dec_and_test(&eb->blocking_readers)) cond_wake_up_nomb(&eb->read_lock_wq); - atomic_dec(&eb->read_locks); + btrfs_assert_tree_read_locks_put(eb); } /* - * take a spinning write lock. This will wait for both - * blocking readers or writers + * Lock for write. Wait for all blocking and spinning readers and writers. This + * starts context where reader lock could be nested by the same thread. + * + * The rwlock is held for write upon exit. */ void btrfs_tree_lock(struct extent_buffer *eb) { + u64 start_ns = 0; + + if (trace_btrfs_tree_lock_enabled()) + start_ns = ktime_get_ns(); + WARN_ON(eb->lock_owner == current->pid); again: wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); - wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); + wait_event(eb->write_lock_wq, READ_ONCE(eb->blocking_writers) == 0); write_lock(&eb->lock); + /* Refetch value after lock */ if (atomic_read(&eb->blocking_readers) || - atomic_read(&eb->blocking_writers)) { + READ_ONCE(eb->blocking_writers)) { write_unlock(&eb->lock); goto again; } - WARN_ON(atomic_read(&eb->spinning_writers)); - atomic_inc(&eb->spinning_writers); - atomic_inc(&eb->write_locks); + btrfs_assert_spinning_writers_get(eb); + btrfs_assert_tree_write_locks_get(eb); eb->lock_owner = current->pid; + trace_btrfs_tree_lock(eb, start_ns); } /* - * drop a spinning or a blocking write lock. + * Release the write lock, either blocking or spinning (ie. there's no need + * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking). + * This also ends the context for nesting, the read lock must have been + * released already. + * + * Tasks blocked and waiting are woken, rwlock is not held upon exit. */ void btrfs_tree_unlock(struct extent_buffer *eb) { - int blockers = atomic_read(&eb->blocking_writers); + /* + * This is read both locked and unlocked but always by the same thread + * that already owns the lock so we don't need to use READ_ONCE + */ + int blockers = eb->blocking_writers; BUG_ON(blockers > 1); btrfs_assert_tree_locked(eb); + trace_btrfs_tree_unlock(eb); eb->lock_owner = 0; - atomic_dec(&eb->write_locks); + btrfs_assert_tree_write_locks_put(eb); if (blockers) { - WARN_ON(atomic_read(&eb->spinning_writers)); - atomic_dec(&eb->blocking_writers); - /* Use the lighter barrier after atomic */ - smp_mb__after_atomic(); - cond_wake_up_nomb(&eb->write_lock_wq); + btrfs_assert_no_spinning_writers(eb); + /* Unlocked write */ + WRITE_ONCE(eb->blocking_writers, 0); + /* + * We need to order modifying blocking_writers above with + * actually waking up the sleepers to ensure they see the + * updated value of blocking_writers + */ + cond_wake_up(&eb->write_lock_wq); } else { - WARN_ON(atomic_read(&eb->spinning_writers) != 1); - atomic_dec(&eb->spinning_writers); + btrfs_assert_spinning_writers_put(eb); write_unlock(&eb->lock); } } -void btrfs_assert_tree_locked(struct extent_buffer *eb) +/* + * Set all locked nodes in the path to blocking locks. This should be done + * before scheduling + */ +void btrfs_set_path_blocking(struct btrfs_path *p) { - BUG_ON(!atomic_read(&eb->write_locks)); + int i; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (!p->nodes[i] || !p->locks[i]) + continue; + /* + * If we currently have a spinning reader or writer lock this + * will bump the count of blocking holders and drop the + * spinlock. + */ + if (p->locks[i] == BTRFS_READ_LOCK) { + btrfs_set_lock_blocking_read(p->nodes[i]); + p->locks[i] = BTRFS_READ_LOCK_BLOCKING; + } else if (p->locks[i] == BTRFS_WRITE_LOCK) { + btrfs_set_lock_blocking_write(p->nodes[i]); + p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; + } + } } -static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +/* + * This releases any locks held in the path starting at level and going all the + * way up to the root. + * + * btrfs_search_slot will keep the lock held on higher nodes in a few corner + * cases, such as COW of the block at slot zero in the node. This ignores + * those rules, and it should only be called when there are no more updates to + * be done higher up in the tree. + */ +void btrfs_unlock_up_safe(struct btrfs_path *path, int level) { - BUG_ON(!atomic_read(&eb->read_locks)); + int i; + + if (path->keep_locks) + return; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + continue; + if (!path->locks[i]) + continue; + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); + path->locks[i] = 0; + } } diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 595014f64830..21a285883e89 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -6,6 +6,8 @@ #ifndef BTRFS_LOCKING_H #define BTRFS_LOCKING_H +#include "extent_io.h" + #define BTRFS_WRITE_LOCK 1 #define BTRFS_READ_LOCK 2 #define BTRFS_WRITE_LOCK_BLOCKING 3 @@ -19,13 +21,20 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb); void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); void btrfs_set_lock_blocking_read(struct extent_buffer *eb); void btrfs_set_lock_blocking_write(struct extent_buffer *eb); -void btrfs_clear_lock_blocking_read(struct extent_buffer *eb); -void btrfs_clear_lock_blocking_write(struct extent_buffer *eb); -void btrfs_assert_tree_locked(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); int btrfs_tree_read_lock_atomic(struct extent_buffer *eb); +#ifdef CONFIG_BTRFS_DEBUG +static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { + BUG_ON(!eb->write_locks); +} +#else +static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { } +#endif + +void btrfs_set_path_blocking(struct btrfs_path *p); +void btrfs_unlock_up_safe(struct btrfs_path *path, int level); static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) { diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 579d53ae256f..aa9cd11f4b78 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -63,27 +63,7 @@ struct workspace { static struct workspace_manager wsm; -static void lzo_init_workspace_manager(void) -{ - btrfs_init_workspace_manager(&wsm, &btrfs_lzo_compress); -} - -static void lzo_cleanup_workspace_manager(void) -{ - btrfs_cleanup_workspace_manager(&wsm); -} - -static struct list_head *lzo_get_workspace(unsigned int level) -{ - return btrfs_get_workspace(&wsm, level); -} - -static void lzo_put_workspace(struct list_head *ws) -{ - btrfs_put_workspace(&wsm, ws); -} - -static void lzo_free_workspace(struct list_head *ws) +void lzo_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -93,7 +73,7 @@ static void lzo_free_workspace(struct list_head *ws) kfree(workspace); } -static struct list_head *lzo_alloc_workspace(unsigned int level) +struct list_head *lzo_alloc_workspace(unsigned int level) { struct workspace *workspace; @@ -131,13 +111,9 @@ static inline size_t read_compress_length(const char *buf) return le32_to_cpu(dlen); } -static int lzo_compress_pages(struct list_head *ws, - struct address_space *mapping, - u64 start, - struct page **pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out) +int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; @@ -303,7 +279,7 @@ out: return ret; } -static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) +int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0, ret2; @@ -444,10 +420,9 @@ done: return ret; } -static int lzo_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen) +int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); size_t in_len; @@ -507,20 +482,8 @@ out: return ret; } -static unsigned int lzo_set_level(unsigned int level) -{ - return 0; -} - const struct btrfs_compress_op btrfs_lzo_compress = { - .init_workspace_manager = lzo_init_workspace_manager, - .cleanup_workspace_manager = lzo_cleanup_workspace_manager, - .get_workspace = lzo_get_workspace, - .put_workspace = lzo_put_workspace, - .alloc_workspace = lzo_alloc_workspace, - .free_workspace = lzo_free_workspace, - .compress_pages = lzo_compress_pages, - .decompress_bio = lzo_decompress_bio, - .decompress = lzo_decompress, - .set_level = lzo_set_level, + .workspace_manager = &wsm, + .max_level = 1, + .default_level = 1, }; diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h deleted file mode 100644 index 75246f2f56ba..000000000000 --- a/fs/btrfs/math.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2012 Fujitsu. All rights reserved. - * Written by Miao Xie <miaox@cn.fujitsu.com> - */ - -#ifndef BTRFS_MATH_H -#define BTRFS_MATH_H - -#include <asm/div64.h> - -static inline u64 div_factor(u64 num, int factor) -{ - if (factor == 10) - return num; - num *= factor; - return div_u64(num, 10); -} - -static inline u64 div_factor_fine(u64 num, int factor) -{ - if (factor == 100) - return num; - num *= factor; - return div_u64(num, 100); -} - -#endif diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h new file mode 100644 index 000000000000..72bab64ecf60 --- /dev/null +++ b/fs/btrfs/misc.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_MISC_H +#define BTRFS_MISC_H + +#include <linux/sched.h> +#include <linux/wait.h> +#include <asm/div64.h> + +#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) + +static inline void cond_wake_up(struct wait_queue_head *wq) +{ + /* + * This implies a full smp_mb barrier, see comments for + * waitqueue_active why. + */ + if (wq_has_sleeper(wq)) + wake_up(wq); +} + +static inline void cond_wake_up_nomb(struct wait_queue_head *wq) +{ + /* + * Special case for conditional wakeup where the barrier required for + * waitqueue_active is implied by some of the preceding code. Eg. one + * of such atomic operations (atomic_dec_and_return, ...), or a + * unlock/lock sequence, etc. + */ + if (waitqueue_active(wq)) + wake_up(wq); +} + +static inline u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + return div_u64(num, 10); +} + +static inline u64 div_factor_fine(u64 num, int factor) +{ + if (factor == 100) + return num; + num *= factor; + return div_u64(num, 100); +} + +/* Copy of is_power_of_two that is 64bit safe */ +static inline bool is_power_of_two_u64(u64 n) +{ + return n != 0 && (n & (n - 1)) == 0; +} + +static inline bool has_single_bit_set(u64 n) +{ + return is_power_of_two_u64(n); +} + +#endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 45e3cfd1198b..fb09bc2f8e4d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -7,12 +7,14 @@ #include <linux/blkdev.h> #include <linux/writeback.h> #include <linux/sched/mm.h> +#include "misc.h" #include "ctree.h" #include "transaction.h" #include "btrfs_inode.h" #include "extent_io.h" #include "disk-io.h" #include "compression.h" +#include "delalloc-space.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -195,8 +197,11 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); - if (dio) + if (dio) { + percpu_counter_add_batch(&fs_info->dio_bytes, len, + fs_info->delalloc_batch); set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); + } /* one ref for the tree */ refcount_set(&entry->refs, 1); @@ -271,13 +276,12 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, * when an ordered extent is finished. If the list covers more than one * ordered extent, it is split across multiples. */ -void btrfs_add_ordered_sum(struct inode *inode, - struct btrfs_ordered_extent *entry, +void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum) { struct btrfs_ordered_inode_tree *tree; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &BTRFS_I(entry->inode)->ordered_tree; spin_lock_irq(&tree->lock); list_add_tail(&sum->list, &entry->list); spin_unlock_irq(&tree->lock); @@ -469,6 +473,10 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (root != fs_info->tree_root) btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false); + if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) + percpu_counter_add_batch(&fs_info->dio_bytes, -entry->len, + fs_info->delalloc_batch); + tree = &btrfs_inode->ordered_tree; spin_lock_irq(&tree->lock); node = &entry->rb_node; @@ -539,7 +547,6 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, spin_unlock(&root->ordered_extent_lock); btrfs_init_work(&ordered->flush_work, - btrfs_flush_delalloc_helper, btrfs_run_ordered_extent_work, NULL, NULL); list_add_tail(&ordered->work_list, &works); btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work); @@ -565,12 +572,11 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, return count; } -u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len) { struct btrfs_root *root; struct list_head splice; - u64 total_done = 0; u64 done; INIT_LIST_HEAD(&splice); @@ -590,7 +596,6 @@ u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, done = btrfs_wait_ordered_extents(root, nr, range_start, range_len); btrfs_put_fs_root(root); - total_done += done; spin_lock(&fs_info->ordered_root_lock); if (nr != U64_MAX) { @@ -600,8 +605,6 @@ u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, list_splice_tail(&splice, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); mutex_unlock(&fs_info->ordered_operations_mutex); - - return total_done; } /* @@ -918,14 +921,16 @@ out: * be reclaimed before their checksum is actually put into the btree */ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u32 *sum, int len) + u8 *sum, int len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_sum *ordered_sum; struct btrfs_ordered_extent *ordered; struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; unsigned long num_sectors; unsigned long i; u32 sectorsize = btrfs_inode_sectorsize(inode); + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int index = 0; ordered = btrfs_lookup_ordered_extent(inode, offset); @@ -941,10 +946,10 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, num_sectors = ordered_sum->len >> inode->i_sb->s_blocksize_bits; num_sectors = min_t(int, len - index, num_sectors - i); - memcpy(sum + index, ordered_sum->sums + i, - num_sectors); + memcpy(sum + index, ordered_sum->sums + i * csum_size, + num_sectors * csum_size); - index += (int)num_sectors; + index += (int)num_sectors * csum_size; if (index == len) goto out; disk_bytenr += num_sectors * sectorsize; @@ -956,6 +961,52 @@ out: return index; } +/* + * btrfs_flush_ordered_range - Lock the passed range and ensures all pending + * ordered extents in it are run to completion. + * + * @tree: IO tree used for locking out other users of the range + * @inode: Inode whose ordered tree is to be searched + * @start: Beginning of range to flush + * @end: Last byte of range to lock + * @cached_state: If passed, will return the extent state responsible for the + * locked range. It's the caller's responsibility to free the cached state. + * + * This function always returns with the given range locked, ensuring after it's + * called no order extent can be pending. + */ +void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, + struct btrfs_inode *inode, u64 start, + u64 end, + struct extent_state **cached_state) +{ + struct btrfs_ordered_extent *ordered; + struct extent_state *cache = NULL; + struct extent_state **cachedp = &cache; + + if (cached_state) + cachedp = cached_state; + + while (1) { + lock_extent_bits(tree, start, end, cachedp); + ordered = btrfs_lookup_ordered_range(inode, start, + end - start + 1); + if (!ordered) { + /* + * If no external cached_state has been passed then + * decrement the extra ref taken for cachedp since we + * aren't exposing it outside of this function + */ + if (!cached_state) + refcount_dec(&cache->refs); + break; + } + unlock_extent_cached(tree, start, end, cachedp); + btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } +} + int __init ordered_data_init(void) { btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index fb9a161f0215..4eb0319a86d7 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -23,7 +23,7 @@ struct btrfs_ordered_sum { int len; struct list_head list; /* last field is a variable length array of csums */ - u32 sums[]; + u8 sums[]; }; /* @@ -167,8 +167,7 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type, int compress_type); -void btrfs_add_ordered_sum(struct inode *inode, - struct btrfs_ordered_extent *entry, +void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, u64 file_offset); @@ -184,11 +183,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u32 *sum, int len); + u8 *sum, int len); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); -u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len); +void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, + struct btrfs_inode *inode, u64 start, + u64 end, + struct extent_state **cached_state); int __init ordered_data_init(void); void __cold ordered_data_exit(void); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index df49931ffe92..873b6b694107 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -153,11 +153,11 @@ static void print_eb_refs_lock(struct extent_buffer *eb) #ifdef CONFIG_BTRFS_DEBUG btrfs_info(eb->fs_info, "refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u", - atomic_read(&eb->refs), atomic_read(&eb->write_locks), + atomic_read(&eb->refs), eb->write_locks, atomic_read(&eb->read_locks), - atomic_read(&eb->blocking_writers), + eb->blocking_writers, atomic_read(&eb->blocking_readers), - atomic_read(&eb->spinning_writers), + eb->spinning_writers, atomic_read(&eb->spinning_readers), eb->lock_owner, current->pid); #endif @@ -189,7 +189,7 @@ void btrfs_print_leaf(struct extent_buffer *l) btrfs_info(fs_info, "leaf %llu gen %llu total ptrs %d free space %d owner %llu", btrfs_header_bytenr(l), btrfs_header_generation(l), nr, - btrfs_leaf_free_space(fs_info, l), btrfs_header_owner(l)); + btrfs_leaf_free_space(l), btrfs_header_owner(l)); print_eb_refs_lock(l); for (i = 0 ; i < nr ; i++) { item = btrfs_item_nr(i); @@ -266,9 +266,9 @@ void btrfs_print_leaf(struct extent_buffer *l) struct btrfs_block_group_item); pr_info( "\t\tblock group used %llu chunk_objectid %llu flags %llu\n", - btrfs_disk_block_group_used(l, bi), - btrfs_disk_block_group_chunk_objectid(l, bi), - btrfs_disk_block_group_flags(l, bi)); + btrfs_block_group_used(l, bi), + btrfs_block_group_chunk_objectid(l, bi), + btrfs_block_group_flags(l, bi)); break; case BTRFS_CHUNK_ITEM_KEY: print_chunk(l, btrfs_item_ptr(l, i, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 61d22a56c0ba..deb59e7cfcac 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -23,36 +23,6 @@ struct prop_handler { int inheritable; }; -static int prop_compression_validate(const char *value, size_t len); -static int prop_compression_apply(struct inode *inode, - const char *value, - size_t len); -static const char *prop_compression_extract(struct inode *inode); - -static struct prop_handler prop_handlers[] = { - { - .xattr_name = XATTR_BTRFS_PREFIX "compression", - .validate = prop_compression_validate, - .apply = prop_compression_apply, - .extract = prop_compression_extract, - .inheritable = 1 - }, -}; - -void __init btrfs_props_init(void) -{ - int i; - - hash_init(prop_handlers_ht); - - for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { - struct prop_handler *p = &prop_handlers[i]; - u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); - - hash_add(prop_handlers_ht, &p->node, h); - } -} - static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash) { struct hlist_head *h; @@ -85,15 +55,9 @@ find_prop_handler(const char *name, return NULL; } -static int __btrfs_set_prop(struct btrfs_trans_handle *trans, - struct inode *inode, - const char *name, - const char *value, - size_t value_len, - int flags) +int btrfs_validate_prop(const char *name, const char *value, size_t value_len) { const struct prop_handler *handler; - int ret; if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN) return -EINVAL; @@ -102,9 +66,26 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans, if (!handler) return -EINVAL; + if (value_len == 0) + return 0; + + return handler->validate(value, value_len); +} + +int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, + const char *name, const char *value, size_t value_len, + int flags) +{ + const struct prop_handler *handler; + int ret; + + handler = find_prop_handler(name, NULL); + if (!handler) + return -EINVAL; + if (value_len == 0) { ret = btrfs_setxattr(trans, inode, handler->xattr_name, - NULL, 0, flags); + NULL, 0, flags); if (ret) return ret; @@ -114,17 +95,14 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans, return ret; } - ret = handler->validate(value, value_len); - if (ret) - return ret; - ret = btrfs_setxattr(trans, inode, handler->xattr_name, - value, value_len, flags); + ret = btrfs_setxattr(trans, inode, handler->xattr_name, value, + value_len, flags); if (ret) return ret; ret = handler->apply(inode, value, value_len); if (ret) { - btrfs_setxattr(trans, inode, handler->xattr_name, - NULL, 0, flags); + btrfs_setxattr(trans, inode, handler->xattr_name, NULL, + 0, flags); return ret; } @@ -133,15 +111,6 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans, return 0; } -int btrfs_set_prop(struct inode *inode, - const char *name, - const char *value, - size_t value_len, - int flags) -{ - return __btrfs_set_prop(NULL, inode, name, value, value_len, flags); -} - static int iterate_object_props(struct btrfs_root *root, struct btrfs_path *path, u64 objectid, @@ -283,6 +252,74 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) return ret; } +static int prop_compression_validate(const char *value, size_t len) +{ + if (!value) + return 0; + + if (btrfs_compress_is_valid_type(value, len)) + return 0; + + return -EINVAL; +} + +static int prop_compression_apply(struct inode *inode, const char *value, + size_t len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + int type; + + if (len == 0) { + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + + return 0; + } + + if (!strncmp("lzo", value, 3)) { + type = BTRFS_COMPRESS_LZO; + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + } else if (!strncmp("zlib", value, 4)) { + type = BTRFS_COMPRESS_ZLIB; + } else if (!strncmp("zstd", value, 4)) { + type = BTRFS_COMPRESS_ZSTD; + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); + } else { + return -EINVAL; + } + + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->prop_compress = type; + + return 0; +} + +static const char *prop_compression_extract(struct inode *inode) +{ + switch (BTRFS_I(inode)->prop_compress) { + case BTRFS_COMPRESS_ZLIB: + case BTRFS_COMPRESS_LZO: + case BTRFS_COMPRESS_ZSTD: + return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress); + default: + break; + } + + return NULL; +} + +static struct prop_handler prop_handlers[] = { + { + .xattr_name = XATTR_BTRFS_PREFIX "compression", + .validate = prop_compression_validate, + .apply = prop_compression_apply, + .extract = prop_compression_extract, + .inheritable = 1 + }, +}; + static int inherit_props(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *parent) @@ -291,6 +328,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; int ret; int i; + bool need_reserve = false; if (!test_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(parent)->runtime_flags)) @@ -299,7 +337,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { const struct prop_handler *h = &prop_handlers[i]; const char *value; - u64 num_bytes; + u64 num_bytes = 0; if (!h->inheritable) continue; @@ -308,20 +346,51 @@ static int inherit_props(struct btrfs_trans_handle *trans, if (!value) continue; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - ret = btrfs_block_rsv_add(root, trans->block_rsv, - num_bytes, BTRFS_RESERVE_NO_FLUSH); - if (ret) - goto out; - ret = __btrfs_set_prop(trans, inode, h->xattr_name, - value, strlen(value), 0); - btrfs_block_rsv_release(fs_info, trans->block_rsv, num_bytes); + /* + * This is not strictly necessary as the property should be + * valid, but in case it isn't, don't propagate it futher. + */ + ret = h->validate(value, strlen(value)); if (ret) - goto out; + continue; + + /* + * Currently callers should be reserving 1 item for properties, + * since we only have 1 property that we currently support. If + * we add more in the future we need to try and reserve more + * space for them. But we should also revisit how we do space + * reservations if we do add more properties in the future. + */ + if (need_reserve) { + num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + ret = btrfs_block_rsv_add(root, trans->block_rsv, + num_bytes, BTRFS_RESERVE_NO_FLUSH); + if (ret) + return ret; + } + + ret = btrfs_setxattr(trans, inode, h->xattr_name, value, + strlen(value), 0); + if (!ret) { + ret = h->apply(inode, value, strlen(value)); + if (ret) + btrfs_setxattr(trans, inode, h->xattr_name, + NULL, 0, 0); + else + set_bit(BTRFS_INODE_HAS_PROPS, + &BTRFS_I(inode)->runtime_flags); + } + + if (need_reserve) { + btrfs_block_rsv_release(fs_info, trans->block_rsv, + num_bytes); + if (ret) + return ret; + } + need_reserve = true; } - ret = 0; -out: - return ret; + + return 0; } int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, @@ -347,11 +416,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - parent_inode = btrfs_iget(sb, &key, parent_root, NULL); + parent_inode = btrfs_iget(sb, &key, parent_root); if (IS_ERR(parent_inode)) return PTR_ERR(parent_inode); - child_inode = btrfs_iget(sb, &key, root, NULL); + child_inode = btrfs_iget(sb, &key, root); if (IS_ERR(child_inode)) { iput(parent_inode); return PTR_ERR(child_inode); @@ -364,64 +433,15 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, return ret; } -static int prop_compression_validate(const char *value, size_t len) -{ - if (!strncmp("lzo", value, 3)) - return 0; - else if (!strncmp("zlib", value, 4)) - return 0; - else if (!strncmp("zstd", value, 4)) - return 0; - - return -EINVAL; -} - -static int prop_compression_apply(struct inode *inode, - const char *value, - size_t len) +void __init btrfs_props_init(void) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int type; - - if (len == 0) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; - - return 0; - } - - if (!strncmp("lzo", value, 3)) { - type = BTRFS_COMPRESS_LZO; - btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); - } else if (!strncmp("zlib", value, 4)) { - type = BTRFS_COMPRESS_ZLIB; - } else if (!strncmp("zstd", value, 4)) { - type = BTRFS_COMPRESS_ZSTD; - btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); - } else { - return -EINVAL; - } - - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->prop_compress = type; + int i; - return 0; -} + for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { + struct prop_handler *p = &prop_handlers[i]; + u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); -static const char *prop_compression_extract(struct inode *inode) -{ - switch (BTRFS_I(inode)->prop_compress) { - case BTRFS_COMPRESS_ZLIB: - case BTRFS_COMPRESS_LZO: - case BTRFS_COMPRESS_ZSTD: - return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress); - default: - break; + hash_add(prop_handlers_ht, &p->node, h); } - - return NULL; } - diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 618815b4f9d5..40b2c65b518c 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -10,11 +10,10 @@ void __init btrfs_props_init(void); -int btrfs_set_prop(struct inode *inode, - const char *name, - const char *value, - size_t value_len, +int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, + const char *name, const char *value, size_t value_len, int flags); +int btrfs_validate_prop(const char *name, const char *value, size_t value_len); int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e659d9d61107..39fc8c3d3a75 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -21,7 +21,7 @@ #include "backref.h" #include "extent_io.h" #include "qgroup.h" - +#include "block-group.h" /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? @@ -918,8 +918,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) /* * initially create the quota tree */ - quota_root = btrfs_create_tree(trans, fs_info, - BTRFS_QUOTA_TREE_OBJECTID); + quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID); if (IS_ERR(quota_root)) { ret = PTR_ERR(quota_root); btrfs_abort_transaction(trans, ret); @@ -1101,7 +1100,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) list_del("a_root->dirty_list); btrfs_tree_lock(quota_root->node); - clean_tree_block(fs_info, quota_root->node); + btrfs_clean_tree_block(quota_root->node); btrfs_tree_unlock(quota_root->node); btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); @@ -1313,8 +1312,9 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; struct ulist *tmp; + bool found = false; int ret = 0; - int err; + int ret2; tmp = ulist_alloc(GFP_KERNEL); if (!tmp) @@ -1328,28 +1328,39 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, member = find_qgroup_rb(fs_info, src); parent = find_qgroup_rb(fs_info, dst); - if (!member || !parent) { - ret = -EINVAL; - goto out; - } + /* + * The parent/member pair doesn't exist, then try to delete the dead + * relation items only. + */ + if (!member || !parent) + goto delete_item; /* check if such qgroup relation exist firstly */ list_for_each_entry(list, &member->groups, next_group) { - if (list->group == parent) - goto exist; + if (list->group == parent) { + found = true; + break; + } } - ret = -ENOENT; - goto out; -exist: + +delete_item: ret = del_qgroup_relation_item(trans, src, dst); - err = del_qgroup_relation_item(trans, dst, src); - if (err && !ret) - ret = err; + if (ret < 0 && ret != -ENOENT) + goto out; + ret2 = del_qgroup_relation_item(trans, dst, src); + if (ret2 < 0 && ret2 != -ENOENT) + goto out; - spin_lock(&fs_info->qgroup_lock); - del_relation_rb(fs_info, src, dst); - ret = quick_update_accounting(fs_info, tmp, src, dst, -1); - spin_unlock(&fs_info->qgroup_lock); + /* At least one deletion succeeded, return 0 */ + if (!ret || !ret2) + ret = 0; + + if (found) { + spin_lock(&fs_info->qgroup_lock); + del_relation_rb(fs_info, src, dst); + ret = quick_update_accounting(fs_info, tmp, src, dst, -1); + spin_unlock(&fs_info->qgroup_lock); + } out: ulist_free(tmp); return ret; @@ -1800,7 +1811,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0); /* For src_path */ - extent_buffer_get(src_eb); + atomic_inc(&src_eb->refs); src_path->nodes[root_level] = src_eb; src_path->slots[root_level] = dst_path->slots[root_level]; src_path->locks[root_level] = 0; @@ -2056,7 +2067,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, goto out; } /* For dst_path */ - extent_buffer_get(dst_eb); + atomic_inc(&dst_eb->refs); dst_path->nodes[level] = dst_eb; dst_path->slots[level] = 0; dst_path->locks[level] = 0; @@ -2115,7 +2126,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, * walk back up the tree (adjusting slot pointers as we go) * and restart the search process. */ - extent_buffer_get(root_eb); /* For path */ + atomic_inc(&root_eb->refs); /* For path */ path->nodes[root_level] = root_eb; path->slots[root_level] = 0; path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ @@ -2412,8 +2423,12 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 nr_old_roots = 0; int ret = 0; + /* + * If quotas get disabled meanwhile, the resouces need to be freed and + * we can't just exit here. + */ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) - return 0; + goto out_free; if (new_roots) { if (!maybe_fs_roots(new_roots)) @@ -2615,6 +2630,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, int ret = 0; int i; u64 *i_qgroups; + bool committing = false; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; @@ -2622,7 +2638,25 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, u32 level_size = 0; u64 nums; - mutex_lock(&fs_info->qgroup_ioctl_lock); + /* + * There are only two callers of this function. + * + * One in create_subvol() in the ioctl context, which needs to hold + * the qgroup_ioctl_lock. + * + * The other one in create_pending_snapshot() where no other qgroup + * code can modify the fs as they all need to either start a new trans + * or hold a trans handler, thus we don't need to hold + * qgroup_ioctl_lock. + * This would avoid long and complex lock chain and make lockdep happy. + */ + spin_lock(&fs_info->trans_lock); + if (trans->transaction->state == TRANS_STATE_COMMIT_DOING) + committing = true; + spin_unlock(&fs_info->trans_lock); + + if (!committing) + mutex_lock(&fs_info->qgroup_ioctl_lock); if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) goto out; @@ -2786,7 +2820,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, unlock: spin_unlock(&fs_info->qgroup_lock); out: - mutex_unlock(&fs_info->qgroup_ioctl_lock); + if (!committing) + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } @@ -3135,9 +3170,6 @@ out: btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); - if (!btrfs_fs_closing(fs_info)) - fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -3153,16 +3185,30 @@ out: trans = btrfs_start_transaction(fs_info->quota_root, 1); if (IS_ERR(trans)) { err = PTR_ERR(trans); + trans = NULL; btrfs_err(fs_info, "fail to start transaction for status update: %d", err); - goto done; } - ret = update_qgroup_status_item(trans); - if (ret < 0) { - err = ret; - btrfs_err(fs_info, "fail to update qgroup status: %d", err); + + mutex_lock(&fs_info->qgroup_rescan_lock); + if (!btrfs_fs_closing(fs_info)) + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + if (trans) { + ret = update_qgroup_status_item(trans); + if (ret < 0) { + err = ret; + btrfs_err(fs_info, "fail to update qgroup status: %d", + err); + } } + fs_info->qgroup_rescan_running = false; + complete_all(&fs_info->qgroup_rescan_completion); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (!trans) + return; + btrfs_end_transaction(trans); if (btrfs_fs_closing(fs_info)) { @@ -3173,12 +3219,6 @@ out: } else { btrfs_err(fs_info, "qgroup scan failed with %d", err); } - -done: - mutex_lock(&fs_info->qgroup_rescan_lock); - fs_info->qgroup_rescan_running = false; - mutex_unlock(&fs_info->qgroup_rescan_lock); - complete_all(&fs_info->qgroup_rescan_completion); } /* @@ -3196,12 +3236,12 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { btrfs_warn(fs_info, - "qgroup rescan init failed, qgroup is not enabled"); + "qgroup rescan init failed, qgroup rescan is not queued"); ret = -EINVAL; } else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { btrfs_warn(fs_info, - "qgroup rescan init failed, qgroup rescan is not queued"); + "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; } @@ -3241,10 +3281,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); - memset(&fs_info->qgroup_rescan_work, 0, - sizeof(fs_info->qgroup_rescan_work)); btrfs_init_work(&fs_info->qgroup_rescan_work, - btrfs_qgroup_rescan_helper, btrfs_qgroup_rescan_worker, NULL, NULL); return 0; } @@ -3406,6 +3443,9 @@ cleanup: while ((unode = ulist_next(&reserved->range_changed, &uiter))) clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); + /* Also free data bytes of already reserved one */ + btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, + orig_reserved, BTRFS_QGROUP_RSV_DATA); extent_changeset_release(reserved); return ret; } @@ -3450,7 +3490,7 @@ static int qgroup_free_reserved_data(struct inode *inode, * EXTENT_QGROUP_RESERVED, we won't double free. * So not need to rush. */ - ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree, + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, free_start, free_start + free_len - 1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) @@ -3590,7 +3630,7 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, type, (s64)num_bytes); + trace_qgroup_meta_reserve(root, (s64)num_bytes, type); ret = qgroup_reserve(root, num_bytes, enforce, type); if (ret < 0) return ret; @@ -3637,7 +3677,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, */ num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, type, -(s64)num_bytes); + trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, num_bytes, type); } @@ -3787,7 +3827,7 @@ out: */ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *subvol_root, - struct btrfs_block_group_cache *bg, + struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, u64 last_snapshot) @@ -3831,7 +3871,13 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, subvol_slot); block->last_snapshot = last_snapshot; block->level = level; - if (bg->flags & BTRFS_BLOCK_GROUP_DATA) + + /* + * If we have bg == NULL, we're called from btrfs_recover_relocation(), + * no one else can modify tree blocks thus we qgroup will not change + * no matter the value of trace_leaf. + */ + if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA) block->trace_leaf = true; else block->trace_leaf = false; diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 46ba7bd2961c..236f12224d52 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -408,7 +408,7 @@ void btrfs_qgroup_init_swapped_blocks( void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root); int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *subvol_root, - struct btrfs_block_group_cache *bg, + struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, u64 last_snapshot); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 67a6f7d47402..a8e53c8e7b01 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -35,6 +35,22 @@ #define RBIO_CACHE_SIZE 1024 +#define BTRFS_STRIPE_HASH_TABLE_BITS 11 + +/* Used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash { + struct list_head hash_list; + spinlock_t lock; +}; + +/* Used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash_table { + struct list_head stripe_cache; + spinlock_t cache_lock; + int cache_size; + struct btrfs_stripe_hash table[]; +}; + enum btrfs_rbio_ops { BTRFS_RBIO_WRITE, BTRFS_RBIO_READ_REBUILD, @@ -174,7 +190,7 @@ static void scrub_parity_work(struct btrfs_work *work); static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func) { - btrfs_init_work(&rbio->work, btrfs_rmw_helper, work_func, NULL, NULL); + btrfs_init_work(&rbio->work, work_func, NULL, NULL); btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); } @@ -655,8 +671,7 @@ static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) */ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) { - int bucket = rbio_bucket(rbio); - struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; + struct btrfs_stripe_hash *h; struct btrfs_raid_bio *cur; struct btrfs_raid_bio *pending; unsigned long flags; @@ -664,64 +679,63 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) struct btrfs_raid_bio *cache_drop = NULL; int ret = 0; + h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio); + spin_lock_irqsave(&h->lock, flags); list_for_each_entry(cur, &h->hash_list, hash_list) { - if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) { - spin_lock(&cur->bio_list_lock); - - /* can we steal this cached rbio's pages? */ - if (bio_list_empty(&cur->bio_list) && - list_empty(&cur->plug_list) && - test_bit(RBIO_CACHE_BIT, &cur->flags) && - !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { - list_del_init(&cur->hash_list); - refcount_dec(&cur->refs); - - steal_rbio(cur, rbio); - cache_drop = cur; - spin_unlock(&cur->bio_list_lock); + if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0]) + continue; - goto lockit; - } + spin_lock(&cur->bio_list_lock); - /* can we merge into the lock owner? */ - if (rbio_can_merge(cur, rbio)) { - merge_rbio(cur, rbio); - spin_unlock(&cur->bio_list_lock); - freeit = rbio; - ret = 1; - goto out; - } + /* Can we steal this cached rbio's pages? */ + if (bio_list_empty(&cur->bio_list) && + list_empty(&cur->plug_list) && + test_bit(RBIO_CACHE_BIT, &cur->flags) && + !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { + list_del_init(&cur->hash_list); + refcount_dec(&cur->refs); + steal_rbio(cur, rbio); + cache_drop = cur; + spin_unlock(&cur->bio_list_lock); - /* - * we couldn't merge with the running - * rbio, see if we can merge with the - * pending ones. We don't have to - * check for rmw_locked because there - * is no way they are inside finish_rmw - * right now - */ - list_for_each_entry(pending, &cur->plug_list, - plug_list) { - if (rbio_can_merge(pending, rbio)) { - merge_rbio(pending, rbio); - spin_unlock(&cur->bio_list_lock); - freeit = rbio; - ret = 1; - goto out; - } - } + goto lockit; + } - /* no merging, put us on the tail of the plug list, - * our rbio will be started with the currently - * running rbio unlocks - */ - list_add_tail(&rbio->plug_list, &cur->plug_list); + /* Can we merge into the lock owner? */ + if (rbio_can_merge(cur, rbio)) { + merge_rbio(cur, rbio); spin_unlock(&cur->bio_list_lock); + freeit = rbio; ret = 1; goto out; } + + + /* + * We couldn't merge with the running rbio, see if we can merge + * with the pending ones. We don't have to check for rmw_locked + * because there is no way they are inside finish_rmw right now + */ + list_for_each_entry(pending, &cur->plug_list, plug_list) { + if (rbio_can_merge(pending, rbio)) { + merge_rbio(pending, rbio); + spin_unlock(&cur->bio_list_lock); + freeit = rbio; + ret = 1; + goto out; + } + } + + /* + * No merging, put us on the tail of the plug list, our rbio + * will be started with the currently running rbio unlocks + */ + list_add_tail(&rbio->plug_list, &cur->plug_list); + spin_unlock(&cur->bio_list_lock); + ret = 1; + goto out; } lockit: refcount_inc(&rbio->refs); @@ -1442,12 +1456,11 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio, static void set_bio_pages_uptodate(struct bio *bio) { struct bio_vec *bvec; - int i; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i, iter_all) + bio_for_each_segment_all(bvec, bio, iter_all) SetPageUptodate(bvec->bv_page); } @@ -1728,8 +1741,7 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) plug = container_of(cb, struct btrfs_plug_cb, cb); if (from_schedule) { - btrfs_init_work(&plug->work, btrfs_rmw_helper, - unplug_work, NULL, NULL); + btrfs_init_work(&plug->work, unplug_work, NULL, NULL); btrfs_queue_work(plug->info->rmw_workers, &plug->work); return; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index f5d4c13a8dbc..2503485db859 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -7,7 +7,7 @@ #ifndef BTRFS_RAID56_H #define BTRFS_RAID56_H -static inline int nr_parity_stripes(struct map_lookup *map) +static inline int nr_parity_stripes(const struct map_lookup *map) { if (map->type & BTRFS_BLOCK_GROUP_RAID5) return 1; @@ -17,7 +17,7 @@ static inline int nr_parity_stripes(struct map_lookup *map) return 0; } -static inline int nr_data_stripes(struct map_lookup *map) +static inline int nr_data_stripes(const struct map_lookup *map) { return map->num_stripes - nr_parity_stripes(map); } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 10d9589001a9..243a2e44526e 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -14,6 +14,7 @@ #include "disk-io.h" #include "transaction.h" #include "dev-replace.h" +#include "block-group.h" #undef DEBUG @@ -226,7 +227,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, struct btrfs_fs_info *fs_info = dev->fs_info; int ret; struct reada_zone *zone; - struct btrfs_block_group_cache *cache = NULL; + struct btrfs_block_group *cache = NULL; u64 start; u64 end; int i; @@ -247,8 +248,8 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, if (!cache) return NULL; - start = cache->key.objectid; - end = start + cache->key.offset - 1; + start = cache->start; + end = start + cache->length - 1; btrfs_put_block_group(cache); zone = kzalloc(sizeof(*zone), GFP_KERNEL); @@ -638,6 +639,35 @@ static int reada_pick_zone(struct btrfs_device *dev) return 1; } +static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, + int mirror_num, struct extent_buffer **eb) +{ + struct extent_buffer *buf = NULL; + int ret; + + buf = btrfs_find_create_tree_block(fs_info, bytenr); + if (IS_ERR(buf)) + return 0; + + set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); + + ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num); + if (ret) { + free_extent_buffer_stale(buf); + return ret; + } + + if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { + free_extent_buffer_stale(buf); + return -EIO; + } else if (extent_buffer_uptodate(buf)) { + *eb = buf; + } else { + free_extent_buffer(buf); + } + return 0; +} + static int reada_start_machine_dev(struct btrfs_device *dev) { struct btrfs_fs_info *fs_info = dev->fs_info; @@ -722,21 +752,19 @@ static int reada_start_machine_dev(struct btrfs_device *dev) static void reada_start_machine_worker(struct btrfs_work *work) { struct reada_machine_work *rmw; - struct btrfs_fs_info *fs_info; int old_ioprio; rmw = container_of(work, struct reada_machine_work, work); - fs_info = rmw->fs_info; - - kfree(rmw); old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current), task_nice_ioprio(current)); set_task_ioprio(current, BTRFS_IOPRIO_READA); - __reada_start_machine(fs_info); + __reada_start_machine(rmw->fs_info); set_task_ioprio(current, old_ioprio); - atomic_dec(&fs_info->reada_works_cnt); + atomic_dec(&rmw->fs_info->reada_works_cnt); + + kfree(rmw); } static void __reada_start_machine(struct btrfs_fs_info *fs_info) @@ -747,6 +775,7 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) u64 total = 0; int i; +again: do { enqueued = 0; mutex_lock(&fs_devices->device_list_mutex); @@ -758,6 +787,10 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; } while (enqueued && total < 10000); + if (fs_devices->seed) { + fs_devices = fs_devices->seed; + goto again; + } if (enqueued == 0) return; @@ -786,8 +819,7 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info) /* FIXME we cannot handle this properly right now */ BUG(); } - btrfs_init_work(&rmw->work, btrfs_readahead_helper, - reada_start_machine_worker, NULL, NULL); + btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); rmw->fs_info = fs_info; btrfs_queue_work(fs_info->readahead_workers, &rmw->work); diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index d09b6cdb785a..b57f3618e58e 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -205,28 +205,17 @@ static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid) #ifdef CONFIG_STACKTRACE static void __save_stack_trace(struct ref_action *ra) { - struct stack_trace stack_trace; - - stack_trace.max_entries = MAX_TRACE; - stack_trace.nr_entries = 0; - stack_trace.entries = ra->trace; - stack_trace.skip = 2; - save_stack_trace(&stack_trace); - ra->trace_len = stack_trace.nr_entries; + ra->trace_len = stack_trace_save(ra->trace, MAX_TRACE, 2); } static void __print_stack_trace(struct btrfs_fs_info *fs_info, struct ref_action *ra) { - struct stack_trace trace; - if (ra->trace_len == 0) { btrfs_err(fs_info, " ref-verify: no stacktrace"); return; } - trace.nr_entries = ra->trace_len; - trace.entries = ra->trace; - print_stack_trace(&trace, 2); + stack_trace_print(ra->trace, ra->trace_len, 2); } #else static void inline __save_stack_trace(struct ref_action *ra) @@ -511,7 +500,7 @@ static int process_leaf(struct btrfs_root *root, struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; u32 count; - int i = 0, tree_block_level = 0, ret; + int i = 0, tree_block_level = 0, ret = 0; struct btrfs_key key; int nritems = btrfs_header_nritems(leaf); @@ -520,6 +509,7 @@ static int process_leaf(struct btrfs_root *root, switch (key.type) { case BTRFS_EXTENT_ITEM_KEY: *num_bytes = key.offset; + /* fall through */ case BTRFS_METADATA_ITEM_KEY: *bytenr = key.objectid; ret = process_extent_item(fs_info, path, &key, i, @@ -670,36 +660,43 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info, /* * btrfs_ref_tree_mod: called when we modify a ref for a bytenr - * @root: the root we are making this modification from. - * @bytenr: the bytenr we are modifying. - * @num_bytes: number of bytes. - * @parent: the parent bytenr. - * @ref_root: the original root owner of the bytenr. - * @owner: level in the case of metadata, inode in the case of data. - * @offset: 0 for metadata, file offset for data. - * @action: the action that we are doing, this is the same as the delayed ref - * action. * * This will add an action item to the given bytenr and do sanity checks to make * sure we haven't messed something up. If we are making a new allocation and * this block entry has history we will delete all previous actions as long as * our sanity checks pass as they are no longer needed. */ -int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, - u64 parent, u64 ref_root, u64 owner, u64 offset, - int action) +int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, + struct btrfs_ref *generic_ref) { - struct btrfs_fs_info *fs_info = root->fs_info; struct ref_entry *ref = NULL, *exist; struct ref_action *ra = NULL; struct block_entry *be = NULL; struct root_entry *re = NULL; + int action = generic_ref->action; int ret = 0; - bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + bool metadata; + u64 bytenr = generic_ref->bytenr; + u64 num_bytes = generic_ref->len; + u64 parent = generic_ref->parent; + u64 ref_root; + u64 owner; + u64 offset; - if (!btrfs_test_opt(root->fs_info, REF_VERIFY)) + if (!btrfs_test_opt(fs_info, REF_VERIFY)) return 0; + if (generic_ref->type == BTRFS_REF_METADATA) { + ref_root = generic_ref->tree_ref.root; + owner = generic_ref->tree_ref.level; + offset = 0; + } else { + ref_root = generic_ref->data_ref.ref_root; + owner = generic_ref->data_ref.ino; + offset = generic_ref->data_ref.offset; + } + metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS); ra = kmalloc(sizeof(struct ref_action), GFP_NOFS); if (!ra || !ref) { @@ -732,7 +729,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, INIT_LIST_HEAD(&ra->list); ra->action = action; - ra->root = root->root_key.objectid; + ra->root = generic_ref->real_root; /* * This is an allocation, preallocate the block_entry in case we haven't @@ -745,7 +742,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, * is and the new root objectid, so let's not treat the passed * in root as if it really has a ref for this bytenr. */ - be = add_block_entry(root->fs_info, bytenr, num_bytes, ref_root); + be = add_block_entry(fs_info, bytenr, num_bytes, ref_root); if (IS_ERR(be)) { kfree(ra); ret = PTR_ERR(be); @@ -787,13 +784,13 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, * one we want to lookup below when we modify the * re->num_refs. */ - ref_root = root->root_key.objectid; - re->root_objectid = root->root_key.objectid; + ref_root = generic_ref->real_root; + re->root_objectid = generic_ref->real_root; re->num_refs = 0; } - spin_lock(&root->fs_info->ref_verify_lock); - be = lookup_block_entry(&root->fs_info->block_tree, bytenr); + spin_lock(&fs_info->ref_verify_lock); + be = lookup_block_entry(&fs_info->block_tree, bytenr); if (!be) { btrfs_err(fs_info, "trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!", @@ -862,7 +859,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, * didn't think of some other corner case. */ btrfs_err(fs_info, "failed to find root %llu for %llu", - root->root_key.objectid, be->bytenr); + generic_ref->real_root, be->bytenr); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); kfree(ra); @@ -881,7 +878,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, list_add_tail(&ra->list, &be->actions); ret = 0; out_unlock: - spin_unlock(&root->fs_info->ref_verify_lock); + spin_unlock(&fs_info->ref_verify_lock); out: if (ret) btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h index b7d2a4edfdb7..855de37719b5 100644 --- a/fs/btrfs/ref-verify.h +++ b/fs/btrfs/ref-verify.h @@ -9,9 +9,8 @@ #ifdef CONFIG_BTRFS_FS_REF_VERIFY int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info); -int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, - u64 parent, u64 ref_root, u64 owner, u64 offset, - int action); +int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, + struct btrfs_ref *generic_ref); void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, u64 len); @@ -30,9 +29,8 @@ static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) { } -static inline int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, int action) +static inline int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, + struct btrfs_ref *generic_ref) { return 0; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ddf028509931..da5abd62db22 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -20,6 +20,8 @@ #include "inode-map.h" #include "qgroup.h" #include "print-tree.h" +#include "delalloc-space.h" +#include "block-group.h" /* * backref_node, mapping_node and tree_block start with this @@ -145,7 +147,7 @@ struct file_extent_cluster { struct reloc_control { /* block group to relocate */ - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; /* extent tree */ struct btrfs_root *extent_root; /* inode for moving data */ @@ -515,6 +517,34 @@ static int update_backref_cache(struct btrfs_trans_handle *trans, return 1; } +static bool reloc_root_is_dead(struct btrfs_root *root) +{ + /* + * Pair with set_bit/clear_bit in clean_dirty_subvols and + * btrfs_update_reloc_root. We need to see the updated bit before + * trying to access reloc_root + */ + smp_rmb(); + if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) + return true; + return false; +} + +/* + * Check if this subvolume tree has valid reloc tree. + * + * Reloc tree after swap is considered dead, thus not considered as valid. + * This is enough for most callers, as they don't distinguish dead reloc root + * from no reloc root. But should_ignore_root() below is a special case. + */ +static bool have_reloc_root(struct btrfs_root *root) +{ + if (reloc_root_is_dead(root)) + return false; + if (!root->reloc_root) + return false; + return true; +} static int should_ignore_root(struct btrfs_root *root) { @@ -523,6 +553,10 @@ static int should_ignore_root(struct btrfs_root *root) if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return 0; + /* This root has been merged with its reloc tree, we can ignore it */ + if (reloc_root_is_dead(root)) + return 1; + reloc_root = root->reloc_root; if (!reloc_root) return 0; @@ -1433,6 +1467,13 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, int clear_rsv = 0; int ret; + /* + * The subvolume has reloc tree but the swap is finished, no need to + * create/update the dead reloc tree + */ + if (reloc_root_is_dead(root)) + return 0; + if (root->reloc_root) { reloc_root = root->reloc_root; reloc_root->last_trans = trans->transid; @@ -1469,8 +1510,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root_item *root_item; int ret; - if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) || - !root->reloc_root) + if (!have_reloc_root(root)) goto out; reloc_root = root->reloc_root; @@ -1480,6 +1520,11 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, if (fs_info->reloc_ctl->merge_reloc_tree && btrfs_root_refs(root_item) == 0) { set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); + /* + * Mark the tree as dead before we change reloc_root so + * have_reloc_root will not touch it from now on. + */ + smp_wmb(); __del_reloc_root(reloc_root); } @@ -1551,11 +1596,10 @@ again: return NULL; } -static int in_block_group(u64 bytenr, - struct btrfs_block_group_cache *block_group) +static int in_block_group(u64 bytenr, struct btrfs_block_group *block_group) { - if (bytenr >= block_group->key.objectid && - bytenr < block_group->key.objectid + block_group->key.offset) + if (bytenr >= block_group->start && + bytenr < block_group->start + block_group->length) return 1; return 0; } @@ -1643,6 +1687,8 @@ int replace_file_extents(struct btrfs_trans_handle *trans, nritems = btrfs_header_nritems(leaf); for (i = 0; i < nritems; i++) { + struct btrfs_ref ref = { 0 }; + cond_resched(); btrfs_item_key_to_cpu(leaf, &key, i); if (key.type != BTRFS_EXTENT_DATA_KEY) @@ -1703,18 +1749,23 @@ int replace_file_extents(struct btrfs_trans_handle *trans, dirty = 1; key.offset -= btrfs_file_extent_offset(leaf, fi); - ret = btrfs_inc_extent_ref(trans, root, new_bytenr, - num_bytes, parent, - btrfs_header_owner(leaf), - key.objectid, key.offset); + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, + num_bytes, parent); + ref.real_root = root->root_key.objectid; + btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), + key.objectid, key.offset); + ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - parent, btrfs_header_owner(leaf), - key.objectid, key.offset); + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, + num_bytes, parent); + ref.real_root = root->root_key.objectid; + btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), + key.objectid, key.offset); + ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; @@ -1756,6 +1807,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_fs_info *fs_info = dest->fs_info; struct extent_buffer *eb; struct extent_buffer *parent; + struct btrfs_ref ref = { 0 }; struct btrfs_key key; u64 old_bytenr; u64 new_bytenr; @@ -1916,23 +1968,31 @@ again: path->slots[level], old_ptr_gen); btrfs_mark_buffer_dirty(path->nodes[level]); - ret = btrfs_inc_extent_ref(trans, src, old_bytenr, - blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr, + blocksize, path->nodes[level]->start); + ref.skip_qgroup = true; + btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); + ret = btrfs_inc_extent_ref(trans, &ref); BUG_ON(ret); - ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, - blocksize, 0, dest->root_key.objectid, - level - 1, 0); + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, + blocksize, 0); + ref.skip_qgroup = true; + btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); + ret = btrfs_inc_extent_ref(trans, &ref); BUG_ON(ret); - ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, - path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, + blocksize, path->nodes[level]->start); + btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); + ref.skip_qgroup = true; + ret = btrfs_free_extent(trans, &ref); BUG_ON(ret); - ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, - 0, dest->root_key.objectid, level - 1, - 0); + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, + blocksize, 0); + btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); + ref.skip_qgroup = true; + ret = btrfs_free_extent(trans, &ref); BUG_ON(ret); btrfs_unlock_up_safe(path, 0); @@ -2161,22 +2221,35 @@ static int clean_dirty_subvols(struct reloc_control *rc) struct btrfs_root *root; struct btrfs_root *next; int ret = 0; + int ret2; list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots, reloc_dirty_list) { - struct btrfs_root *reloc_root = root->reloc_root; + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + /* Merged subvolume, cleanup its reloc root */ + struct btrfs_root *reloc_root = root->reloc_root; - clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); - list_del_init(&root->reloc_dirty_list); - root->reloc_root = NULL; - if (reloc_root) { - int ret2; + list_del_init(&root->reloc_dirty_list); + root->reloc_root = NULL; + if (reloc_root) { - ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1); + ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1); + if (ret2 < 0 && !ret) + ret = ret2; + } + /* + * Need barrier to ensure clear_bit() only happens after + * root->reloc_root = NULL. Pairs with have_reloc_root. + */ + smp_wmb(); + clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); + btrfs_put_fs_root(root); + } else { + /* Orphan reloc tree, just clean it up */ + ret2 = btrfs_drop_snapshot(root, NULL, 0, 1); if (ret2 < 0 && !ret) ret = ret2; } - btrfs_put_fs_root(root); } return ret; } @@ -2213,7 +2286,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_root_level(root_item); - extent_buffer_get(reloc_root->node); + atomic_inc(&reloc_root->node->refs); path->nodes[level] = reloc_root->node; path->slots[level] = 0; } else { @@ -2464,6 +2537,9 @@ again: } } else { list_del_init(&reloc_root->root_list); + /* Don't forget to queue this reloc root for cleanup */ + list_add_tail(&reloc_root->reloc_dirty_list, + &rc->dirty_subvol_roots); } } @@ -2721,6 +2797,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, rc->backref_cache.path[node->level] = node; list_for_each_entry(edge, &node->upper, list[LOWER]) { struct btrfs_key first_key; + struct btrfs_ref ref = { 0 }; cond_resched(); @@ -2826,11 +2903,13 @@ static int do_relocation(struct btrfs_trans_handle *trans, trans->transid); btrfs_mark_buffer_dirty(upper->eb); - ret = btrfs_inc_extent_ref(trans, root, - node->eb->start, blocksize, - upper->eb->start, - btrfs_header_owner(upper->eb), - node->level, 0); + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, + node->eb->start, blocksize, + upper->eb->start); + ref.real_root = root->root_key.objectid; + btrfs_init_tree_ref(&ref, node->level, + btrfs_header_owner(upper->eb)); + ret = btrfs_inc_extent_ref(trans, &ref); BUG_ON(ret); ret = btrfs_drop_subtree(trans, root, eb, upper->eb); @@ -3156,7 +3235,6 @@ static noinline_for_stack int setup_extent_mapping(struct inode *inode, u64 start, u64 end, u64 block_start) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em; int ret = 0; @@ -3169,7 +3247,6 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, em->len = end + 1 - start; em->block_len = em->len; em->block_start = block_start; - em->bdev = fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); lock_extent(&BTRFS_I(inode)->io_tree, start, end); @@ -3238,6 +3315,8 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!page) { btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), + PAGE_SIZE); ret = -ENOMEM; goto out; } @@ -3258,7 +3337,7 @@ static int relocate_file_extent_cluster(struct inode *inode, btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE, true); btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE, true); + PAGE_SIZE); ret = -EIO; goto out; } @@ -3280,14 +3359,14 @@ static int relocate_file_extent_cluster(struct inode *inode, } ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, - NULL, 0); + NULL); if (ret) { unlock_page(page); put_page(page); btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE, true); btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE, true); + PAGE_SIZE); clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, @@ -3303,8 +3382,7 @@ static int relocate_file_extent_cluster(struct inode *inode, put_page(page); index++; - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, - false); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); } @@ -3504,7 +3582,7 @@ static int block_use_full_backref(struct reloc_control *rc, } static int delete_block_group_cache(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct inode *inode, u64 ino) { @@ -3520,7 +3598,7 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(fs_info->sb, &key, root, NULL); + inode = btrfs_iget(fs_info->sb, &key, root); if (IS_ERR(inode)) return -ENOENT; @@ -3823,7 +3901,7 @@ int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, u64 start, end, last; int ret; - last = rc->block_group->key.objectid + rc->block_group->key.offset; + last = rc->block_group->start + rc->block_group->length; while (1) { cond_resched(); if (rc->search_start >= last) { @@ -3940,7 +4018,7 @@ int prepare_to_relocate(struct reloc_control *rc) return -ENOMEM; memset(&rc->cluster, 0, sizeof(rc->cluster)); - rc->search_start = rc->block_group->key.objectid; + rc->search_start = rc->block_group->start; rc->extents_found = 0; rc->nodes_relocated = 0; rc->merging_rsv_size = 0; @@ -4179,7 +4257,7 @@ out: */ static noinline_for_stack struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *group) + struct btrfs_block_group *group) { struct inode *inode = NULL; struct btrfs_trans_handle *trans; @@ -4206,9 +4284,9 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, key.objectid = objectid; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(fs_info->sb, &key, root, NULL); + inode = btrfs_iget(fs_info->sb, &key, root); BUG_ON(IS_ERR(inode)); - BTRFS_I(inode)->index_cnt = group->key.objectid; + BTRFS_I(inode)->index_cnt = group->start; err = btrfs_orphan_add(trans, BTRFS_I(inode)); out: @@ -4222,7 +4300,7 @@ out: return inode; } -static struct reloc_control *alloc_reloc_control(void) +static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) { struct reloc_control *rc; @@ -4234,7 +4312,8 @@ static struct reloc_control *alloc_reloc_control(void) INIT_LIST_HEAD(&rc->dirty_subvol_roots); backref_cache_init(&rc->backref_cache); mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, NULL); + extent_io_tree_init(fs_info, &rc->processed_blocks, + IO_TREE_RELOC_BLOCKS, NULL); return rc; } @@ -4242,7 +4321,7 @@ static struct reloc_control *alloc_reloc_control(void) * Print the block group being relocated */ static void describe_relocation(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group) + struct btrfs_block_group *block_group) { char buf[128] = {'\0'}; @@ -4250,7 +4329,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info, btrfs_info(fs_info, "relocating block group %llu flags %s", - block_group->key.objectid, buf); + block_group->start, buf); } /* @@ -4258,7 +4337,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info, */ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) { - struct btrfs_block_group_cache *bg; + struct btrfs_block_group *bg; struct btrfs_root *extent_root = fs_info->extent_root; struct reloc_control *rc; struct inode *inode; @@ -4276,7 +4355,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) return -ETXTBSY; } - rc = alloc_reloc_control(); + rc = alloc_reloc_control(fs_info); if (!rc) { btrfs_put_block_group(bg); return -ENOMEM; @@ -4285,7 +4364,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) rc->extent_root = extent_root; rc->block_group = bg; - ret = btrfs_inc_block_group_ro(rc->block_group); + ret = btrfs_inc_block_group_ro(rc->block_group, true); if (ret) { err = ret; goto out; @@ -4298,7 +4377,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) goto out; } - inode = lookup_free_space_inode(fs_info, rc->block_group, path); + inode = lookup_free_space_inode(rc->block_group, path); btrfs_free_path(path); if (!IS_ERR(inode)) @@ -4323,39 +4402,48 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) btrfs_wait_block_group_reservations(rc->block_group); btrfs_wait_nocow_writers(rc->block_group); btrfs_wait_ordered_roots(fs_info, U64_MAX, - rc->block_group->key.objectid, - rc->block_group->key.offset); + rc->block_group->start, + rc->block_group->length); while (1) { mutex_lock(&fs_info->cleaner_mutex); ret = relocate_block_group(rc); mutex_unlock(&fs_info->cleaner_mutex); - if (ret < 0) { + if (ret < 0) err = ret; - goto out; - } - - if (rc->extents_found == 0) - break; - - btrfs_info(fs_info, "found %llu extents", rc->extents_found); + /* + * We may have gotten ENOSPC after we already dirtied some + * extents. If writeout happens while we're relocating a + * different block group we could end up hitting the + * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in + * btrfs_reloc_cow_block. Make sure we write everything out + * properly so we don't trip over this problem, and then break + * out of the loop if we hit an error. + */ if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { ret = btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); - if (ret) { + if (ret) err = ret; - goto out; - } invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1); rc->stage = UPDATE_DATA_PTRS; } + + if (err < 0) + goto out; + + if (rc->extents_found == 0) + break; + + btrfs_info(fs_info, "found %llu extents", rc->extents_found); + } WARN_ON(rc->block_group->pinned > 0); WARN_ON(rc->block_group->reserved > 0); - WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); + WARN_ON(rc->block_group->used > 0); out: if (err && rw) btrfs_dec_block_group_ro(rc->block_group); @@ -4472,7 +4560,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (list_empty(&reloc_roots)) goto out; - rc = alloc_reloc_control(); + rc = alloc_reloc_control(fs_info); if (!rc) { err = -ENOMEM; goto out; @@ -4505,6 +4593,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) fs_root = read_fs_root(fs_info, reloc_root->root_key.offset); if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); + list_add_tail(&reloc_root->root_list, &reloc_roots); goto out_free; } @@ -4594,7 +4683,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) new_bytenr = ordered->start + (sums->bytenr - disk_bytenr); sums->bytenr = new_bytenr; - btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_add_ordered_sum(ordered, sums); } out: btrfs_put_ordered_extent(ordered); @@ -4638,7 +4727,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, node->new_bytenr != buf->start); drop_node_buffer(node); - extent_buffer_get(cow); + atomic_inc(&cow->refs); node->eb = cow; node->new_bytenr = cow->start; @@ -4667,14 +4756,12 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve) { - struct btrfs_root *root; - struct reloc_control *rc; + struct btrfs_root *root = pending->root; + struct reloc_control *rc = root->fs_info->reloc_ctl; - root = pending->root; - if (!root->reloc_root) + if (!rc || !have_reloc_root(root)) return; - rc = root->fs_info->reloc_ctl; if (!rc->merge_reloc_tree) return; @@ -4703,10 +4790,10 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *reloc_root; struct btrfs_root *new_root; - struct reloc_control *rc; + struct reloc_control *rc = root->fs_info->reloc_ctl; int ret; - if (!root->reloc_root) + if (!rc || !have_reloc_root(root)) return 0; rc = root->fs_info->reloc_ctl; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 893d12fbfda0..612411c74550 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -9,6 +9,8 @@ #include "transaction.h" #include "disk-io.h" #include "print-tree.h" +#include "qgroup.h" +#include "space-info.h" /* * Read a root item from the tree. In case we detect a root item smaller then @@ -132,16 +134,17 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root return -ENOMEM; ret = btrfs_search_slot(trans, root, key, path, 0, 1); - if (ret < 0) { - btrfs_abort_transaction(trans, ret); + if (ret < 0) goto out; - } - if (ret != 0) { - btrfs_print_leaf(path->nodes[0]); - btrfs_crit(fs_info, "unable to update root key %llu %u %llu", - key->objectid, key->type, key->offset); - BUG_ON(1); + if (ret > 0) { + btrfs_crit(fs_info, + "unable to find root key (%llu %u %llu) in tree %llu", + key->objectid, key->type, key->offset, + root->root_key.objectid); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + goto out; } l = path->nodes[0]; @@ -373,11 +376,13 @@ again: leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); - - WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); - WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); ptr = (unsigned long)(ref + 1); - WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); + if ((btrfs_root_ref_dirid(leaf, ref) != dirid) || + (btrfs_root_ref_name_len(leaf, ref) != name_len) || + memcmp_extent_buffer(leaf, name, ptr, name_len)) { + err = -ENOENT; + goto out; + } *sequence = btrfs_root_ref_sequence(leaf, ref); ret = btrfs_del_item(trans, tree_root, path); @@ -496,3 +501,57 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec); spin_unlock(&root->root_item_lock); } + +/* + * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * root: the root of the parent directory + * rsv: block reservation + * items: the number of items that we need do reservation + * use_global_rsv: allow fallback to the global block reservation + * + * This function is used to reserve the space for snapshot/subvolume + * creation and deletion. Those operations are different with the + * common file/directory operations, they change two fs/file trees + * and root tree, the number of items that the qgroup reserves is + * different with the free space reservation. So we can not use + * the space reservation mechanism in start_transaction(). + */ +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, int items, + bool use_global_rsv) +{ + u64 qgroup_num_bytes = 0; + u64 num_bytes; + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + + if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + /* One for parent inode, two for dir entries */ + qgroup_num_bytes = 3 * fs_info->nodesize; + ret = btrfs_qgroup_reserve_meta_prealloc(root, + qgroup_num_bytes, true); + if (ret) + return ret; + } + + num_bytes = btrfs_calc_insert_metadata_size(fs_info, items); + rsv->space_info = btrfs_find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + + if (ret == -ENOSPC && use_global_rsv) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); + + if (ret && qgroup_num_bytes) + btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); + + return ret; +} + +void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv) +{ + btrfs_block_rsv_release(fs_info, rsv, (u64)-1); +} diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a99588536c79..fd266a2d15ec 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -6,6 +6,7 @@ #include <linux/blkdev.h> #include <linux/ratelimit.h> #include <linux/sched/mm.h> +#include <crypto/hash.h> #include "ctree.h" #include "volumes.h" #include "disk-io.h" @@ -17,6 +18,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "raid56.h" +#include "block-group.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -387,8 +389,7 @@ static struct full_stripe_lock *search_full_stripe_lock( * * Caller must ensure @cache is a RAID56 block group. */ -static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache, - u64 bytenr) +static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr) { u64 ret; @@ -402,8 +403,8 @@ static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache, * round_down() can only handle power of 2, while RAID56 full * stripe length can be 64KiB * n, so we need to manually round down. */ - ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) * - cache->full_stripe_len + cache->key.objectid; + ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) * + cache->full_stripe_len + cache->start; return ret; } @@ -421,7 +422,7 @@ static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache, static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, bool *locked_ret) { - struct btrfs_block_group_cache *bg_cache; + struct btrfs_block_group *bg_cache; struct btrfs_full_stripe_locks_tree *locks_root; struct full_stripe_lock *existing; u64 fstripe_start; @@ -468,7 +469,7 @@ out: static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, bool locked) { - struct btrfs_block_group_cache *bg_cache; + struct btrfs_block_group *bg_cache; struct btrfs_full_stripe_locks_tree *locks_root; struct full_stripe_lock *fstripe_lock; u64 fstripe_start; @@ -596,8 +597,8 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( sbio->index = i; sbio->sctx = sctx; sbio->page_count = 0; - btrfs_init_work(&sbio->work, btrfs_scrub_helper, - scrub_bio_end_io_worker, NULL, NULL); + btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL, + NULL); if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; @@ -1718,8 +1719,7 @@ static void scrub_wr_bio_end_io(struct bio *bio) sbio->status = bio->bi_status; sbio->bio = bio; - btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, - scrub_wr_bio_end_io_worker, NULL, NULL); + btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); } @@ -1787,11 +1787,12 @@ static int scrub_checksum(struct scrub_block *sblock) static int scrub_checksum_data(struct scrub_block *sblock) { struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 csum[BTRFS_CSUM_SIZE]; u8 *on_disk_csum; struct page *page; void *buffer; - u32 crc = ~(u32)0; u64 len; int index; @@ -1799,6 +1800,9 @@ static int scrub_checksum_data(struct scrub_block *sblock) if (!sblock->pagev[0]->have_csum) return 0; + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + on_disk_csum = sblock->pagev[0]->csum; page = sblock->pagev[0]->page; buffer = kmap_atomic(page); @@ -1808,7 +1812,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, PAGE_SIZE); - crc = btrfs_csum_data(buffer, crc, l); + crypto_shash_update(shash, buffer, l); kunmap_atomic(buffer); len -= l; if (len == 0) @@ -1820,7 +1824,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) buffer = kmap_atomic(page); } - btrfs_csum_final(crc, csum); + crypto_shash_final(shash, csum); if (memcmp(csum, on_disk_csum, sctx->csum_size)) sblock->checksum_error = 1; @@ -1832,16 +1836,19 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) struct scrub_ctx *sctx = sblock->sctx; struct btrfs_header *h; struct btrfs_fs_info *fs_info = sctx->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; struct page *page; void *mapped_buffer; u64 mapped_size; void *p; - u32 crc = ~(u32)0; u64 len; int index; + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + BUG_ON(sblock->page_count < 1); page = sblock->pagev[0]->page; mapped_buffer = kmap_atomic(page); @@ -1875,7 +1882,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, mapped_size); - crc = btrfs_csum_data(p, crc, l); + crypto_shash_update(shash, p, l); kunmap_atomic(mapped_buffer); len -= l; if (len == 0) @@ -1889,7 +1896,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) p = mapped_buffer; } - btrfs_csum_final(crc, calculated_csum); + crypto_shash_final(shash, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) sblock->checksum_error = 1; @@ -1900,18 +1907,22 @@ static int scrub_checksum_super(struct scrub_block *sblock) { struct btrfs_super_block *s; struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; struct page *page; void *mapped_buffer; u64 mapped_size; void *p; - u32 crc = ~(u32)0; int fail_gen = 0; int fail_cor = 0; u64 len; int index; + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + BUG_ON(sblock->page_count < 1); page = sblock->pagev[0]->page; mapped_buffer = kmap_atomic(page); @@ -1934,7 +1945,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, mapped_size); - crc = btrfs_csum_data(p, crc, l); + crypto_shash_update(shash, p, l); kunmap_atomic(mapped_buffer); len -= l; if (len == 0) @@ -1948,7 +1959,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) p = mapped_buffer; } - btrfs_csum_final(crc, calculated_csum); + crypto_shash_final(shash, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) ++fail_cor; @@ -2136,14 +2147,13 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) scrub_write_block_to_dev_replace(sblock); } - scrub_block_put(sblock); - if (sctx->is_dev_replace && sctx->flush_all_writes) { mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); mutex_unlock(&sctx->wr_lock); } + scrub_block_put(sblock); scrub_pending_bio_dec(sctx); } @@ -2191,8 +2201,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) raid56_add_scrub_pages(rbio, spage->page, spage->logical); } - btrfs_init_work(&sblock->work, btrfs_scrub_helper, - scrub_missing_raid56_worker, NULL, NULL); + btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL); scrub_block_get(sblock); scrub_pending_bio_inc(sctx); raid56_submit_missing_rbio(rbio); @@ -2448,7 +2457,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) ASSERT(index < UINT_MAX); num_sectors = sum->len / sctx->fs_info->sectorsize; - memcpy(csum, sum->sums + index, sctx->csum_size); + memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size); if (index == num_sectors - 1) { list_del(&sum->list); kfree(sum); @@ -2660,18 +2669,18 @@ static int get_raid56_logic_offset(u64 physical, int num, u64 last_offset; u32 stripe_index; u32 rot; + const int data_stripes = nr_data_stripes(map); - last_offset = (physical - map->stripes[num].physical) * - nr_data_stripes(map); + last_offset = (physical - map->stripes[num].physical) * data_stripes; if (stripe_start) *stripe_start = last_offset; *offset = last_offset; - for (i = 0; i < nr_data_stripes(map); i++) { + for (i = 0; i < data_stripes; i++) { *offset = last_offset + i * map->stripe_len; stripe_nr = div64_u64(*offset, map->stripe_len); - stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); + stripe_nr = div_u64(stripe_nr, data_stripes); /* Work out the disk rotation on this stripe-set */ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); @@ -2730,8 +2739,8 @@ static void scrub_parity_bio_endio(struct bio *bio) bio_put(bio); - btrfs_init_work(&sparity->work, btrfs_scrubparity_helper, - scrub_parity_bio_endio_worker, NULL, NULL); + btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL, + NULL); btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work); } @@ -3079,7 +3088,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, offset = map->stripe_len * (num / map->sub_stripes); increment = map->stripe_len * factor; mirror_num = num % map->sub_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { @@ -3407,18 +3416,18 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 chunk_offset, u64 length, u64 dev_offset, - struct btrfs_block_group_cache *cache) + struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; int i; int ret = 0; - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); - read_unlock(&map_tree->map_tree.lock); + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, chunk_offset, 1); + read_unlock(&map_tree->lock); if (!em) { /* @@ -3471,7 +3480,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct extent_buffer *l; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; path = btrfs_alloc_path(); @@ -3550,55 +3559,45 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * -> btrfs_scrub_pause() */ scrub_pause_on(fs_info); - ret = btrfs_inc_block_group_ro(cache); - if (!ret && sctx->is_dev_replace) { - /* - * If we are doing a device replace wait for any tasks - * that started delalloc right before we set the block - * group to RO mode, as they might have just allocated - * an extent from it or decided they could do a nocow - * write. And if any such tasks did that, wait for their - * ordered extents to complete and then commit the - * current transaction, so that we can later see the new - * extent items in the extent tree - the ordered extents - * create delayed data references (for cow writes) when - * they complete, which will be run and insert the - * corresponding extent items into the extent tree when - * we commit the transaction they used when running - * inode.c:btrfs_finish_ordered_io(). We later use - * the commit root of the extent tree to find extents - * to copy from the srcdev into the tgtdev, and we don't - * want to miss any new extents. - */ - btrfs_wait_block_group_reservations(cache); - btrfs_wait_nocow_writers(cache); - ret = btrfs_wait_ordered_roots(fs_info, U64_MAX, - cache->key.objectid, - cache->key.offset); - if (ret > 0) { - struct btrfs_trans_handle *trans; - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - ret = PTR_ERR(trans); - else - ret = btrfs_commit_transaction(trans); - if (ret) { - scrub_pause_off(fs_info); - btrfs_put_block_group(cache); - break; - } - } - } - scrub_pause_off(fs_info); + /* + * Don't do chunk preallocation for scrub. + * + * This is especially important for SYSTEM bgs, or we can hit + * -EFBIG from btrfs_finish_chunk_alloc() like: + * 1. The only SYSTEM bg is marked RO. + * Since SYSTEM bg is small, that's pretty common. + * 2. New SYSTEM bg will be allocated + * Due to regular version will allocate new chunk. + * 3. New SYSTEM bg is empty and will get cleaned up + * Before cleanup really happens, it's marked RO again. + * 4. Empty SYSTEM bg get scrubbed + * We go back to 2. + * + * This can easily boost the amount of SYSTEM chunks if cleaner + * thread can't be triggered fast enough, and use up all space + * of btrfs_super_block::sys_chunk_array + * + * While for dev replace, we need to try our best to mark block + * group RO, to prevent race between: + * - Write duplication + * Contains latest data + * - Scrub copy + * Contains data from commit tree + * + * If target block group is not marked RO, nocow writes can + * be overwritten by scrub copy, causing data corruption. + * So for dev-replace, it's not allowed to continue if a block + * group is not RO. + */ + ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace); if (ret == 0) { ro_set = 1; - } else if (ret == -ENOSPC) { + } else if (ret == -ENOSPC && !sctx->is_dev_replace) { /* * btrfs_inc_block_group_ro return -ENOSPC when it * failed in creating new chunk for metadata. - * It is not a problem for scrub/replace, because + * It is not a problem for scrub, because * metadata are always cowed, and our scrub paused * commit_transactions. */ @@ -3607,10 +3606,23 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, btrfs_warn(fs_info, "failed setting block group ro: %d", ret); btrfs_put_block_group(cache); + scrub_pause_off(fs_info); break; } - down_write(&fs_info->dev_replace.rwsem); + /* + * Now the target block is marked RO, wait for nocow writes to + * finish before dev-replace. + * COW is fine, as COW never overwrites extents in commit tree. + */ + if (sctx->is_dev_replace) { + btrfs_wait_nocow_writers(cache); + btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, + cache->length); + } + + scrub_pause_off(fs_info); + down_write(&dev_replace->rwsem); dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; @@ -3651,10 +3663,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); - down_write(&fs_info->dev_replace.rwsem); + down_write(&dev_replace->rwsem); dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; - up_write(&fs_info->dev_replace.rwsem); + up_write(&dev_replace->rwsem); if (ro_set) btrfs_dec_block_group_ro(cache); @@ -3668,7 +3680,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ spin_lock(&cache->lock); if (!cache->removed && !cache->ro && cache->reserved == 0 && - btrfs_block_group_used(&cache->item) == 0) { + cache->used == 0) { spin_unlock(&cache->lock); btrfs_mark_bg_unused(cache); } else { @@ -3791,7 +3803,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, struct btrfs_workqueue *scrub_parity = NULL; if (btrfs_fs_closing(fs_info)) - return -EINVAL; + return -EAGAIN; if (fs_info->nodesize > BTRFS_STRIPE_LEN) { /* @@ -3999,9 +4011,9 @@ int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, - struct btrfs_device *dev) +int btrfs_scrub_cancel_dev(struct btrfs_device *dev) { + struct btrfs_fs_info *fs_info = dev->fs_info; struct scrub_ctx *sctx; mutex_lock(&fs_info->scrub_lock); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7ea2d6b1f170..091e5bc8c7ea 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -25,6 +25,14 @@ #include "compression.h" /* + * Maximum number of references an extent can have in order for us to attempt to + * issue clone operations instead of write operations. This currently exists to + * avoid hitting limitations of the backreference walking code (taking a lot of + * time and using too much memory for extents with large number of references). + */ +#define SEND_MAX_EXTENT_REFS 64 + +/* * A fs_path is a helper to dynamically build path names with unknown size. * It reallocates the internal buffer on demand. * It allows fast adding of path elements on the right side (normal path) and @@ -260,6 +268,21 @@ struct name_cache_entry { char name[]; }; +#define ADVANCE 1 +#define ADVANCE_ONLY_NEXT -1 + +enum btrfs_compare_tree_result { + BTRFS_COMPARE_TREE_NEW, + BTRFS_COMPARE_TREE_DELETED, + BTRFS_COMPARE_TREE_CHANGED, + BTRFS_COMPARE_TREE_SAME, +}; +typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, + struct btrfs_path *right_path, + struct btrfs_key *key, + enum btrfs_compare_tree_result result, + void *ctx); + __cold static void inconsistent_snapshot_error(struct send_ctx *sctx, enum btrfs_compare_tree_result result, @@ -686,7 +709,7 @@ static int send_cmd(struct send_ctx *sctx) hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); hdr->crc = 0; - crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); hdr->crc = cpu_to_le32(crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, @@ -1160,7 +1183,6 @@ out: struct backref_ctx { struct send_ctx *sctx; - struct btrfs_path *path; /* number of total found references */ u64 found; @@ -1213,8 +1235,6 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) { struct backref_ctx *bctx = ctx_; struct clone_root *found; - int ret; - u64 i_size; /* First check if the root is in the list of accepted clone sources */ found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots, @@ -1231,30 +1251,25 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) } /* - * There are inodes that have extents that lie behind its i_size. Don't - * accept clones from these extents. - */ - ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL, - NULL, NULL, NULL); - btrfs_release_path(bctx->path); - if (ret < 0) - return ret; - - if (offset + bctx->data_offset + bctx->extent_len > i_size) - return 0; - - /* * Make sure we don't consider clones from send_root that are * behind the current inode/offset. */ if (found->root == bctx->sctx->send_root) { /* - * TODO for the moment we don't accept clones from the inode - * that is currently send. We may change this when - * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same - * file. + * If the source inode was not yet processed we can't issue a + * clone operation, as the source extent does not exist yet at + * the destination of the stream. + */ + if (ino > bctx->cur_objectid) + return 0; + /* + * We clone from the inode currently being sent as long as the + * source extent is already processed, otherwise we could try + * to clone from an extent that does not exist yet at the + * destination of the stream. */ - if (ino >= bctx->cur_objectid) + if (ino == bctx->cur_objectid && + offset >= bctx->sctx->cur_inode_next_write_offset) return 0; } @@ -1303,6 +1318,7 @@ static int find_extent_clone(struct send_ctx *sctx, struct clone_root *cur_clone_root; struct btrfs_key found_key; struct btrfs_path *tmp_path; + struct btrfs_extent_item *ei; int compressed; u32 i; @@ -1319,8 +1335,6 @@ static int find_extent_clone(struct send_ctx *sctx, goto out; } - backref_ctx->path = tmp_path; - if (data_offset >= ino_size) { /* * There may be extents that lie behind the file's size. @@ -1352,7 +1366,6 @@ static int find_extent_clone(struct send_ctx *sctx, ret = extent_from_logical(fs_info, disk_byte, tmp_path, &found_key, &flags); up_read(&fs_info->commit_root_sem); - btrfs_release_path(tmp_path); if (ret < 0) goto out; @@ -1361,6 +1374,21 @@ static int find_extent_clone(struct send_ctx *sctx, goto out; } + ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0], + struct btrfs_extent_item); + /* + * Backreference walking (iterate_extent_inodes() below) is currently + * too expensive when an extent has a large number of references, both + * in time spent and used memory. So for now just fallback to write + * operations instead of clone operations when an extent has more than + * a certain amount of references. + */ + if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) { + ret = -ENOENT; + goto out; + } + btrfs_release_path(tmp_path); + /* * Setup the clone roots. */ @@ -4782,7 +4810,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(fs_info->sb, &key, root, NULL); + inode = btrfs_iget(fs_info->sb, &key, root); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -5017,6 +5045,12 @@ static int send_hole(struct send_ctx *sctx, u64 end) if (offset >= sctx->cur_inode_size) return 0; + /* + * Don't go beyond the inode's i_size due to prealloc extents that start + * after the i_size. + */ + end = min_t(u64, end, sctx->cur_inode_size); + if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, end - offset); @@ -5082,6 +5116,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *path; struct btrfs_key key; int ret; + u64 clone_src_i_size = 0; /* * Prevent cloning from a zero offset with a length matching the sector @@ -5107,6 +5142,16 @@ static int clone_range(struct send_ctx *sctx, return -ENOMEM; /* + * There are inodes that have extents that lie behind its i_size. Don't + * accept clones from these extents. + */ + ret = __get_inode_info(clone_root->root, path, clone_root->ino, + &clone_src_i_size, NULL, NULL, NULL, NULL, NULL); + btrfs_release_path(path); + if (ret < 0) + goto out; + + /* * We can't send a clone operation for the entire range if we find * extent items in the respective range in the source file that * refer to different extents or if we find holes. @@ -5148,6 +5193,7 @@ static int clone_range(struct send_ctx *sctx, u8 type; u64 ext_len; u64 clone_len; + u64 clone_data_offset; if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(clone_root->root, path); @@ -5201,13 +5247,73 @@ static int clone_range(struct send_ctx *sctx, if (key.offset >= clone_root->offset + len) break; + if (key.offset >= clone_src_i_size) + break; + + if (key.offset + ext_len > clone_src_i_size) + ext_len = clone_src_i_size - key.offset; + + clone_data_offset = btrfs_file_extent_offset(leaf, ei); + if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) { + clone_root->offset = key.offset; + if (clone_data_offset < data_offset && + clone_data_offset + ext_len > data_offset) { + u64 extent_offset; + + extent_offset = data_offset - clone_data_offset; + ext_len -= extent_offset; + clone_data_offset += extent_offset; + clone_root->offset += extent_offset; + } + } + clone_len = min_t(u64, ext_len, len); if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte && - btrfs_file_extent_offset(leaf, ei) == data_offset) - ret = send_clone(sctx, offset, clone_len, clone_root); - else + clone_data_offset == data_offset) { + const u64 src_end = clone_root->offset + clone_len; + const u64 sectorsize = SZ_64K; + + /* + * We can't clone the last block, when its size is not + * sector size aligned, into the middle of a file. If we + * do so, the receiver will get a failure (-EINVAL) when + * trying to clone or will silently corrupt the data in + * the destination file if it's on a kernel without the + * fix introduced by commit ac765f83f1397646 + * ("Btrfs: fix data corruption due to cloning of eof + * block). + * + * So issue a clone of the aligned down range plus a + * regular write for the eof block, if we hit that case. + * + * Also, we use the maximum possible sector size, 64K, + * because we don't know what's the sector size of the + * filesystem that receives the stream, so we have to + * assume the largest possible sector size. + */ + if (src_end == clone_src_i_size && + !IS_ALIGNED(src_end, sectorsize) && + offset + clone_len < sctx->cur_inode_size) { + u64 slen; + + slen = ALIGN_DOWN(src_end - clone_root->offset, + sectorsize); + if (slen > 0) { + ret = send_clone(sctx, offset, slen, + clone_root); + if (ret < 0) + goto out; + } + ret = send_extent_data(sctx, offset + slen, + clone_len - slen); + } else { + ret = send_clone(sctx, offset, clone_len, + clone_root); + } + } else { ret = send_extent_data(sctx, offset, clone_len); + } if (ret < 0) goto out; @@ -6262,68 +6368,21 @@ static int changed_extent(struct send_ctx *sctx, { int ret = 0; - if (sctx->cur_ino != sctx->cmp_key->objectid) { - - if (result == BTRFS_COMPARE_TREE_CHANGED) { - struct extent_buffer *leaf_l; - struct extent_buffer *leaf_r; - struct btrfs_file_extent_item *ei_l; - struct btrfs_file_extent_item *ei_r; - - leaf_l = sctx->left_path->nodes[0]; - leaf_r = sctx->right_path->nodes[0]; - ei_l = btrfs_item_ptr(leaf_l, - sctx->left_path->slots[0], - struct btrfs_file_extent_item); - ei_r = btrfs_item_ptr(leaf_r, - sctx->right_path->slots[0], - struct btrfs_file_extent_item); - - /* - * We may have found an extent item that has changed - * only its disk_bytenr field and the corresponding - * inode item was not updated. This case happens due to - * very specific timings during relocation when a leaf - * that contains file extent items is COWed while - * relocation is ongoing and its in the stage where it - * updates data pointers. So when this happens we can - * safely ignore it since we know it's the same extent, - * but just at different logical and physical locations - * (when an extent is fully replaced with a new one, we - * know the generation number must have changed too, - * since snapshot creation implies committing the current - * transaction, and the inode item must have been updated - * as well). - * This replacement of the disk_bytenr happens at - * relocation.c:replace_file_extents() through - * relocation.c:btrfs_reloc_cow_block(). - */ - if (btrfs_file_extent_generation(leaf_l, ei_l) == - btrfs_file_extent_generation(leaf_r, ei_r) && - btrfs_file_extent_ram_bytes(leaf_l, ei_l) == - btrfs_file_extent_ram_bytes(leaf_r, ei_r) && - btrfs_file_extent_compression(leaf_l, ei_l) == - btrfs_file_extent_compression(leaf_r, ei_r) && - btrfs_file_extent_encryption(leaf_l, ei_l) == - btrfs_file_extent_encryption(leaf_r, ei_r) && - btrfs_file_extent_other_encoding(leaf_l, ei_l) == - btrfs_file_extent_other_encoding(leaf_r, ei_r) && - btrfs_file_extent_type(leaf_l, ei_l) == - btrfs_file_extent_type(leaf_r, ei_r) && - btrfs_file_extent_disk_bytenr(leaf_l, ei_l) != - btrfs_file_extent_disk_bytenr(leaf_r, ei_r) && - btrfs_file_extent_disk_num_bytes(leaf_l, ei_l) == - btrfs_file_extent_disk_num_bytes(leaf_r, ei_r) && - btrfs_file_extent_offset(leaf_l, ei_l) == - btrfs_file_extent_offset(leaf_r, ei_r) && - btrfs_file_extent_num_bytes(leaf_l, ei_l) == - btrfs_file_extent_num_bytes(leaf_r, ei_r)) - return 0; - } - - inconsistent_snapshot_error(sctx, result, "extent"); - return -EIO; - } + /* + * We have found an extent item that changed without the inode item + * having changed. This can happen either after relocation (where the + * disk_bytenr of an extent item is replaced at + * relocation.c:replace_file_extents()) or after deduplication into a + * file in both the parent and send snapshots (where an extent item can + * get modified or replaced with a new one). Note that deduplication + * updates the inode item, but it only changes the iversion (sequence + * field in the inode item) of the inode, so if a file is deduplicated + * the same amount of times in both the parent and send snapshots, its + * iversion becames the same in both snapshots, whence the inode item is + * the same on both snapshots. + */ + if (sctx->cur_ino != sctx->cmp_key->objectid) + return 0; if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { if (result != BTRFS_COMPARE_TREE_DELETED) @@ -6501,6 +6560,366 @@ out: return ret; } +static int tree_move_down(struct btrfs_path *path, int *level) +{ + struct extent_buffer *eb; + + BUG_ON(*level == 0); + eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]); + if (IS_ERR(eb)) + return PTR_ERR(eb); + + path->nodes[*level - 1] = eb; + path->slots[*level - 1] = 0; + (*level)--; + return 0; +} + +static int tree_move_next_or_upnext(struct btrfs_path *path, + int *level, int root_level) +{ + int ret = 0; + int nritems; + nritems = btrfs_header_nritems(path->nodes[*level]); + + path->slots[*level]++; + + while (path->slots[*level] >= nritems) { + if (*level == root_level) + return -1; + + /* move upnext */ + path->slots[*level] = 0; + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + (*level)++; + path->slots[*level]++; + + nritems = btrfs_header_nritems(path->nodes[*level]); + ret = 1; + } + return ret; +} + +/* + * Returns 1 if it had to move up and next. 0 is returned if it moved only next + * or down. + */ +static int tree_advance(struct btrfs_path *path, + int *level, int root_level, + int allow_down, + struct btrfs_key *key) +{ + int ret; + + if (*level == 0 || !allow_down) { + ret = tree_move_next_or_upnext(path, level, root_level); + } else { + ret = tree_move_down(path, level); + } + if (ret >= 0) { + if (*level == 0) + btrfs_item_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + else + btrfs_node_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + } + return ret; +} + +static int tree_compare_item(struct btrfs_path *left_path, + struct btrfs_path *right_path, + char *tmp_buf) +{ + int cmp; + int len1, len2; + unsigned long off1, off2; + + len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); + len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); + if (len1 != len2) + return 1; + + off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); + off2 = btrfs_item_ptr_offset(right_path->nodes[0], + right_path->slots[0]); + + read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); + + cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); + if (cmp) + return 1; + return 0; +} + +/* + * This function compares two trees and calls the provided callback for + * every changed/new/deleted item it finds. + * If shared tree blocks are encountered, whole subtrees are skipped, making + * the compare pretty fast on snapshotted subvolumes. + * + * This currently works on commit roots only. As commit roots are read only, + * we don't do any locking. The commit roots are protected with transactions. + * Transactions are ended and rejoined when a commit is tried in between. + * + * This function checks for modifications done to the trees while comparing. + * If it detects a change, it aborts immediately. + */ +static int btrfs_compare_trees(struct btrfs_root *left_root, + struct btrfs_root *right_root, + btrfs_changed_cb_t changed_cb, void *ctx) +{ + struct btrfs_fs_info *fs_info = left_root->fs_info; + int ret; + int cmp; + struct btrfs_path *left_path = NULL; + struct btrfs_path *right_path = NULL; + struct btrfs_key left_key; + struct btrfs_key right_key; + char *tmp_buf = NULL; + int left_root_level; + int right_root_level; + int left_level; + int right_level; + int left_end_reached; + int right_end_reached; + int advance_left; + int advance_right; + u64 left_blockptr; + u64 right_blockptr; + u64 left_gen; + u64 right_gen; + + left_path = btrfs_alloc_path(); + if (!left_path) { + ret = -ENOMEM; + goto out; + } + right_path = btrfs_alloc_path(); + if (!right_path) { + ret = -ENOMEM; + goto out; + } + + tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); + if (!tmp_buf) { + ret = -ENOMEM; + goto out; + } + + left_path->search_commit_root = 1; + left_path->skip_locking = 1; + right_path->search_commit_root = 1; + right_path->skip_locking = 1; + + /* + * Strategy: Go to the first items of both trees. Then do + * + * If both trees are at level 0 + * Compare keys of current items + * If left < right treat left item as new, advance left tree + * and repeat + * If left > right treat right item as deleted, advance right tree + * and repeat + * If left == right do deep compare of items, treat as changed if + * needed, advance both trees and repeat + * If both trees are at the same level but not at level 0 + * Compare keys of current nodes/leafs + * If left < right advance left tree and repeat + * If left > right advance right tree and repeat + * If left == right compare blockptrs of the next nodes/leafs + * If they match advance both trees but stay at the same level + * and repeat + * If they don't match advance both trees while allowing to go + * deeper and repeat + * If tree levels are different + * Advance the tree that needs it and repeat + * + * Advancing a tree means: + * If we are at level 0, try to go to the next slot. If that's not + * possible, go one level up and repeat. Stop when we found a level + * where we could go to the next slot. We may at this point be on a + * node or a leaf. + * + * If we are not at level 0 and not on shared tree blocks, go one + * level deeper. + * + * If we are not at level 0 and on shared tree blocks, go one slot to + * the right if possible or go up and right. + */ + + down_read(&fs_info->commit_root_sem); + left_level = btrfs_header_level(left_root->commit_root); + left_root_level = left_level; + left_path->nodes[left_level] = + btrfs_clone_extent_buffer(left_root->commit_root); + if (!left_path->nodes[left_level]) { + up_read(&fs_info->commit_root_sem); + ret = -ENOMEM; + goto out; + } + + right_level = btrfs_header_level(right_root->commit_root); + right_root_level = right_level; + right_path->nodes[right_level] = + btrfs_clone_extent_buffer(right_root->commit_root); + if (!right_path->nodes[right_level]) { + up_read(&fs_info->commit_root_sem); + ret = -ENOMEM; + goto out; + } + up_read(&fs_info->commit_root_sem); + + if (left_level == 0) + btrfs_item_key_to_cpu(left_path->nodes[left_level], + &left_key, left_path->slots[left_level]); + else + btrfs_node_key_to_cpu(left_path->nodes[left_level], + &left_key, left_path->slots[left_level]); + if (right_level == 0) + btrfs_item_key_to_cpu(right_path->nodes[right_level], + &right_key, right_path->slots[right_level]); + else + btrfs_node_key_to_cpu(right_path->nodes[right_level], + &right_key, right_path->slots[right_level]); + + left_end_reached = right_end_reached = 0; + advance_left = advance_right = 0; + + while (1) { + cond_resched(); + if (advance_left && !left_end_reached) { + ret = tree_advance(left_path, &left_level, + left_root_level, + advance_left != ADVANCE_ONLY_NEXT, + &left_key); + if (ret == -1) + left_end_reached = ADVANCE; + else if (ret < 0) + goto out; + advance_left = 0; + } + if (advance_right && !right_end_reached) { + ret = tree_advance(right_path, &right_level, + right_root_level, + advance_right != ADVANCE_ONLY_NEXT, + &right_key); + if (ret == -1) + right_end_reached = ADVANCE; + else if (ret < 0) + goto out; + advance_right = 0; + } + + if (left_end_reached && right_end_reached) { + ret = 0; + goto out; + } else if (left_end_reached) { + if (right_level == 0) { + ret = changed_cb(left_path, right_path, + &right_key, + BTRFS_COMPARE_TREE_DELETED, + ctx); + if (ret < 0) + goto out; + } + advance_right = ADVANCE; + continue; + } else if (right_end_reached) { + if (left_level == 0) { + ret = changed_cb(left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_NEW, + ctx); + if (ret < 0) + goto out; + } + advance_left = ADVANCE; + continue; + } + + if (left_level == 0 && right_level == 0) { + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + ret = changed_cb(left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_NEW, + ctx); + if (ret < 0) + goto out; + advance_left = ADVANCE; + } else if (cmp > 0) { + ret = changed_cb(left_path, right_path, + &right_key, + BTRFS_COMPARE_TREE_DELETED, + ctx); + if (ret < 0) + goto out; + advance_right = ADVANCE; + } else { + enum btrfs_compare_tree_result result; + + WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); + ret = tree_compare_item(left_path, right_path, + tmp_buf); + if (ret) + result = BTRFS_COMPARE_TREE_CHANGED; + else + result = BTRFS_COMPARE_TREE_SAME; + ret = changed_cb(left_path, right_path, + &left_key, result, ctx); + if (ret < 0) + goto out; + advance_left = ADVANCE; + advance_right = ADVANCE; + } + } else if (left_level == right_level) { + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + advance_left = ADVANCE; + } else if (cmp > 0) { + advance_right = ADVANCE; + } else { + left_blockptr = btrfs_node_blockptr( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_blockptr = btrfs_node_blockptr( + right_path->nodes[right_level], + right_path->slots[right_level]); + left_gen = btrfs_node_ptr_generation( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_gen = btrfs_node_ptr_generation( + right_path->nodes[right_level], + right_path->slots[right_level]); + if (left_blockptr == right_blockptr && + left_gen == right_gen) { + /* + * As we're on a shared block, don't + * allow to go deeper. + */ + advance_left = ADVANCE_ONLY_NEXT; + advance_right = ADVANCE_ONLY_NEXT; + } else { + advance_left = ADVANCE; + advance_right = ADVANCE; + } + } + } else if (left_level < right_level) { + advance_right = ADVANCE; + } else { + advance_left = ADVANCE; + } + } + +out: + btrfs_free_path(left_path); + btrfs_free_path(right_path); + kvfree(tmp_buf); + return ret; +} + static int send_subvol(struct send_ctx *sctx) { int ret; @@ -6579,6 +6998,38 @@ commit_trans: return btrfs_commit_transaction(trans); } +/* + * Make sure any existing dellaloc is flushed for any root used by a send + * operation so that we do not miss any data and we do not race with writeback + * finishing and changing a tree while send is using the tree. This could + * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and + * a send operation then uses the subvolume. + * After flushing delalloc ensure_commit_roots_uptodate() must be called. + */ +static int flush_delalloc_roots(struct send_ctx *sctx) +{ + struct btrfs_root *root = sctx->parent_root; + int ret; + int i; + + if (root) { + ret = btrfs_start_delalloc_snapshot(root); + if (ret) + return ret; + btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); + } + + for (i = 0; i < sctx->clone_roots_cnt; i++) { + root = sctx->clone_roots[i].root; + ret = btrfs_start_delalloc_snapshot(root); + if (ret) + return ret; + btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); + } + + return 0; +} + static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) { spin_lock(&root->root_item_lock); @@ -6594,6 +7045,13 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) spin_unlock(&root->root_item_lock); } +static void dedupe_in_progress_warn(const struct btrfs_root *root) +{ + btrfs_warn_rl(root->fs_info, +"cannot use root %llu for send while deduplications on it are in progress (%d in progress)", + root->root_key.objectid, root->dedupe_in_progress); +} + long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) { int ret = 0; @@ -6617,16 +7075,15 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) * making it RW. This also protects against deletion. */ spin_lock(&send_root->root_item_lock); + if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) { + dedupe_in_progress_warn(send_root); + spin_unlock(&send_root->root_item_lock); + return -EAGAIN; + } send_root->send_in_progress++; spin_unlock(&send_root->root_item_lock); /* - * This is done when we lookup the root, it should already be complete - * by the time we get here. - */ - WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE); - - /* * Userspace tools do the checks and warn the user if it's * not RO. */ @@ -6751,6 +7208,13 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) ret = -EPERM; goto out; } + if (clone_root->dedupe_in_progress) { + dedupe_in_progress_warn(clone_root); + spin_unlock(&clone_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -EAGAIN; + goto out; + } clone_root->send_in_progress++; spin_unlock(&clone_root->root_item_lock); srcu_read_unlock(&fs_info->subvol_srcu, index); @@ -6785,6 +7249,13 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) ret = -EPERM; goto out; } + if (sctx->parent_root->dedupe_in_progress) { + dedupe_in_progress_warn(sctx->parent_root); + spin_unlock(&sctx->parent_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -EAGAIN; + goto out; + } spin_unlock(&sctx->parent_root->root_item_lock); srcu_read_unlock(&fs_info->subvol_srcu, index); @@ -6803,13 +7274,31 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) NULL); sort_clone_roots = 1; + ret = flush_delalloc_roots(sctx); + if (ret) + goto out; + ret = ensure_commit_roots_uptodate(sctx); if (ret) goto out; + mutex_lock(&fs_info->balance_mutex); + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + mutex_unlock(&fs_info->balance_mutex); + btrfs_warn_rl(fs_info, + "cannot run send because a balance operation is in progress"); + ret = -EAGAIN; + goto out; + } + fs_info->send_in_progress++; + mutex_unlock(&fs_info->balance_mutex); + current->journal_info = BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); current->journal_info = NULL; + mutex_lock(&fs_info->balance_mutex); + fs_info->send_in_progress--; + mutex_unlock(&fs_info->balance_mutex); if (ret < 0) goto out; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c new file mode 100644 index 000000000000..f09aa6ee9113 --- /dev/null +++ b/fs/btrfs/space-info.c @@ -0,0 +1,1115 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "misc.h" +#include "ctree.h" +#include "space-info.h" +#include "sysfs.h" +#include "volumes.h" +#include "free-space-cache.h" +#include "ordered-data.h" +#include "transaction.h" +#include "block-group.h" + +u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, + bool may_use_included) +{ + ASSERT(s_info); + return s_info->bytes_used + s_info->bytes_reserved + + s_info->bytes_pinned + s_info->bytes_readonly + + (may_use_included ? s_info->bytes_may_use : 0); +} + +/* + * after adding space to the filesystem, we need to clear the full flags + * on all the space infos. + */ +void btrfs_clear_space_info_full(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) + found->full = 0; + rcu_read_unlock(); +} + +static int create_space_info(struct btrfs_fs_info *info, u64 flags) +{ + + struct btrfs_space_info *space_info; + int i; + int ret; + + space_info = kzalloc(sizeof(*space_info), GFP_NOFS); + if (!space_info) + return -ENOMEM; + + ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, + GFP_KERNEL); + if (ret) { + kfree(space_info); + return ret; + } + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + INIT_LIST_HEAD(&space_info->block_groups[i]); + init_rwsem(&space_info->groups_sem); + spin_lock_init(&space_info->lock); + space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; + INIT_LIST_HEAD(&space_info->ro_bgs); + INIT_LIST_HEAD(&space_info->tickets); + INIT_LIST_HEAD(&space_info->priority_tickets); + + ret = btrfs_sysfs_add_space_info_type(info, space_info); + if (ret) + return ret; + + list_add_rcu(&space_info->list, &info->space_info); + if (flags & BTRFS_BLOCK_GROUP_DATA) + info->data_sinfo = space_info; + + return ret; +} + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info) +{ + struct btrfs_super_block *disk_super; + u64 features; + u64 flags; + int mixed = 0; + int ret; + + disk_super = fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + return -EINVAL; + + features = btrfs_super_incompat_flags(disk_super); + if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = 1; + + flags = BTRFS_BLOCK_GROUP_SYSTEM; + ret = create_space_info(fs_info, flags); + if (ret) + goto out; + + if (mixed) { + flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; + ret = create_space_info(fs_info, flags); + } else { + flags = BTRFS_BLOCK_GROUP_METADATA; + ret = create_space_info(fs_info, flags); + if (ret) + goto out; + + flags = BTRFS_BLOCK_GROUP_DATA; + ret = create_space_info(fs_info, flags); + } +out: + return ret; +} + +void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, + struct btrfs_space_info **space_info) +{ + struct btrfs_space_info *found; + int factor; + + factor = btrfs_bg_type_to_factor(flags); + + found = btrfs_find_space_info(info, flags); + ASSERT(found); + spin_lock(&found->lock); + found->total_bytes += total_bytes; + found->disk_total += total_bytes * factor; + found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; + found->bytes_readonly += bytes_readonly; + if (total_bytes > 0) + found->full = 0; + btrfs_try_granting_tickets(info, found); + spin_unlock(&found->lock); + *space_info = found; +} + +struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, + u64 flags) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & flags) { + rcu_read_unlock(); + return found; + } + } + rcu_read_unlock(); + return NULL; +} + +static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) +{ + return (global->size << 1); +} + +static int can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush, + bool system_chunk) +{ + u64 profile; + u64 avail; + u64 used; + int factor; + + /* Don't overcommit when in mixed mode. */ + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) + return 0; + + if (system_chunk) + profile = btrfs_system_alloc_profile(fs_info); + else + profile = btrfs_metadata_alloc_profile(fs_info); + + used = btrfs_space_info_used(space_info, true); + avail = atomic64_read(&fs_info->free_chunk_space); + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually usable. For raid56, the space info used + * doesn't include the parity drive, so we don't have to + * change the math + */ + factor = btrfs_bg_type_to_factor(profile); + avail = div_u64(avail, factor); + + /* + * If we aren't flushing all things, let us overcommit up to + * 1/2th of the space. If we can flush, don't let us overcommit + * too much, let it overcommit up to 1/8 of the space. + */ + if (flush == BTRFS_RESERVE_FLUSH_ALL) + avail >>= 3; + else + avail >>= 1; + + if (used + bytes < space_info->total_bytes + avail) + return 1; + return 0; +} + +/* + * This is for space we already have accounted in space_info->bytes_may_use, so + * basically when we're returning space from block_rsv's. + */ +void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + struct list_head *head; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; + + lockdep_assert_held(&space_info->lock); + + head = &space_info->priority_tickets; +again: + while (!list_empty(head)) { + struct reserve_ticket *ticket; + u64 used = btrfs_space_info_used(space_info, true); + + ticket = list_first_entry(head, struct reserve_ticket, list); + + /* Check and see if our ticket can be satisified now. */ + if ((used + ticket->bytes <= space_info->total_bytes) || + can_overcommit(fs_info, space_info, ticket->bytes, flush, + false)) { + btrfs_space_info_update_bytes_may_use(fs_info, + space_info, + ticket->bytes); + list_del_init(&ticket->list); + ticket->bytes = 0; + space_info->tickets_id++; + wake_up(&ticket->wait); + } else { + break; + } + } + + if (head == &space_info->priority_tickets) { + head = &space_info->tickets; + flush = BTRFS_RESERVE_FLUSH_ALL; + goto again; + } +} + +#define DUMP_BLOCK_RSV(fs_info, rsv_name) \ +do { \ + struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ + spin_lock(&__rsv->lock); \ + btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ + __rsv->size, __rsv->reserved); \ + spin_unlock(&__rsv->lock); \ +} while (0) + +static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *info) +{ + lockdep_assert_held(&info->lock); + + btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", + info->flags, + info->total_bytes - btrfs_space_info_used(info, true), + info->full ? "" : "not "); + btrfs_info(fs_info, + "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", + info->total_bytes, info->bytes_used, info->bytes_pinned, + info->bytes_reserved, info->bytes_may_use, + info->bytes_readonly); + + DUMP_BLOCK_RSV(fs_info, global_block_rsv); + DUMP_BLOCK_RSV(fs_info, trans_block_rsv); + DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); + +} + +void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *info, u64 bytes, + int dump_block_groups) +{ + struct btrfs_block_group *cache; + int index = 0; + + spin_lock(&info->lock); + __btrfs_dump_space_info(fs_info, info); + spin_unlock(&info->lock); + + if (!dump_block_groups) + return; + + down_read(&info->groups_sem); +again: + list_for_each_entry(cache, &info->block_groups[index], list) { + spin_lock(&cache->lock); + btrfs_info(fs_info, + "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", + cache->start, cache->length, cache->used, cache->pinned, + cache->reserved, cache->ro ? "[readonly]" : ""); + btrfs_dump_free_space(cache, bytes); + spin_unlock(&cache->lock); + } + if (++index < BTRFS_NR_RAID_TYPES) + goto again; + up_read(&info->groups_sem); +} + +static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, + unsigned long nr_pages, int nr_items) +{ + struct super_block *sb = fs_info->sb; + + if (down_read_trylock(&sb->s_umount)) { + writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); + up_read(&sb->s_umount); + } else { + /* + * We needn't worry the filesystem going from r/w to r/o though + * we don't acquire ->s_umount mutex, because the filesystem + * should guarantee the delalloc inodes list be empty after + * the filesystem is readonly(all dirty pages are written to + * the disk). + */ + btrfs_start_delalloc_roots(fs_info, nr_items); + if (!current->journal_info) + btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); + } +} + +static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, + u64 to_reclaim) +{ + u64 bytes; + u64 nr; + + bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + nr = div64_u64(to_reclaim, bytes); + if (!nr) + nr = 1; + return nr; +} + +#define EXTENT_SIZE_PER_ITEM SZ_256K + +/* + * shrink metadata reservation for delalloc + */ +static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, + u64 orig, bool wait_ordered) +{ + struct btrfs_space_info *space_info; + struct btrfs_trans_handle *trans; + u64 delalloc_bytes; + u64 dio_bytes; + u64 async_pages; + u64 items; + long time_left; + unsigned long nr_pages; + int loops; + + /* Calc the number of the pages we need flush for space reservation */ + items = calc_reclaim_items_nr(fs_info, to_reclaim); + to_reclaim = items * EXTENT_SIZE_PER_ITEM; + + trans = (struct btrfs_trans_handle *)current->journal_info; + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + + delalloc_bytes = percpu_counter_sum_positive( + &fs_info->delalloc_bytes); + dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); + if (delalloc_bytes == 0 && dio_bytes == 0) { + if (trans) + return; + if (wait_ordered) + btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + return; + } + + /* + * If we are doing more ordered than delalloc we need to just wait on + * ordered extents, otherwise we'll waste time trying to flush delalloc + * that likely won't give us the space back we need. + */ + if (dio_bytes > delalloc_bytes) + wait_ordered = true; + + loops = 0; + while ((delalloc_bytes || dio_bytes) && loops < 3) { + nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + + /* + * Triggers inode writeback for up to nr_pages. This will invoke + * ->writepages callback and trigger delalloc filling + * (btrfs_run_delalloc_range()). + */ + btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); + + /* + * We need to wait for the compressed pages to start before + * we continue. + */ + async_pages = atomic_read(&fs_info->async_delalloc_pages); + if (!async_pages) + goto skip_async; + + /* + * Calculate how many compressed pages we want to be written + * before we continue. I.e if there are more async pages than we + * require wait_event will wait until nr_pages are written. + */ + if (async_pages <= nr_pages) + async_pages = 0; + else + async_pages -= nr_pages; + + wait_event(fs_info->async_submit_wait, + atomic_read(&fs_info->async_delalloc_pages) <= + (int)async_pages); +skip_async: + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets) && + list_empty(&space_info->priority_tickets)) { + spin_unlock(&space_info->lock); + break; + } + spin_unlock(&space_info->lock); + + loops++; + if (wait_ordered && !trans) { + btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + } else { + time_left = schedule_timeout_killable(1); + if (time_left) + break; + } + delalloc_bytes = percpu_counter_sum_positive( + &fs_info->delalloc_bytes); + dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); + } +} + +/** + * maybe_commit_transaction - possibly commit the transaction if its ok to + * @root - the root we're allocating for + * @bytes - the number of bytes we want to reserve + * @force - force the commit + * + * This will check to make sure that committing the transaction will actually + * get us somewhere and then commit the transaction if it does. Otherwise it + * will return -ENOSPC. + */ +static int may_commit_transaction(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + struct reserve_ticket *ticket = NULL; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_trans_handle *trans; + u64 bytes_needed; + u64 reclaim_bytes = 0; + u64 cur_free_bytes = 0; + + trans = (struct btrfs_trans_handle *)current->journal_info; + if (trans) + return -EAGAIN; + + spin_lock(&space_info->lock); + cur_free_bytes = btrfs_space_info_used(space_info, true); + if (cur_free_bytes < space_info->total_bytes) + cur_free_bytes = space_info->total_bytes - cur_free_bytes; + else + cur_free_bytes = 0; + + if (!list_empty(&space_info->priority_tickets)) + ticket = list_first_entry(&space_info->priority_tickets, + struct reserve_ticket, list); + else if (!list_empty(&space_info->tickets)) + ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + bytes_needed = (ticket) ? ticket->bytes : 0; + + if (bytes_needed > cur_free_bytes) + bytes_needed -= cur_free_bytes; + else + bytes_needed = 0; + spin_unlock(&space_info->lock); + + if (!bytes_needed) + return 0; + + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * See if there is enough pinned space to make this reservation, or if + * we have block groups that are going to be freed, allowing us to + * possibly do a chunk allocation the next loop through. + */ + if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || + __percpu_counter_compare(&space_info->total_bytes_pinned, + bytes_needed, + BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) + goto commit; + + /* + * See if there is some space in the delayed insertion reservation for + * this reservation. + */ + if (space_info != delayed_rsv->space_info) + goto enospc; + + spin_lock(&delayed_rsv->lock); + reclaim_bytes += delayed_rsv->reserved; + spin_unlock(&delayed_rsv->lock); + + spin_lock(&delayed_refs_rsv->lock); + reclaim_bytes += delayed_refs_rsv->reserved; + spin_unlock(&delayed_refs_rsv->lock); + if (reclaim_bytes >= bytes_needed) + goto commit; + bytes_needed -= reclaim_bytes; + + if (__percpu_counter_compare(&space_info->total_bytes_pinned, + bytes_needed, + BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) + goto enospc; + +commit: + return btrfs_commit_transaction(trans); +enospc: + btrfs_end_transaction(trans); + return -ENOSPC; +} + +/* + * Try to flush some data based on policy set by @state. This is only advisory + * and may fail for various reasons. The caller is supposed to examine the + * state of @space_info to detect the outcome. + */ +static void flush_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 num_bytes, + int state) +{ + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_trans_handle *trans; + int nr; + int ret = 0; + + switch (state) { + case FLUSH_DELAYED_ITEMS_NR: + case FLUSH_DELAYED_ITEMS: + if (state == FLUSH_DELAYED_ITEMS_NR) + nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; + else + nr = -1; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_run_delayed_items_nr(trans, nr); + btrfs_end_transaction(trans); + break; + case FLUSH_DELALLOC: + case FLUSH_DELALLOC_WAIT: + shrink_delalloc(fs_info, num_bytes * 2, num_bytes, + state == FLUSH_DELALLOC_WAIT); + break; + case FLUSH_DELAYED_REFS_NR: + case FLUSH_DELAYED_REFS: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + if (state == FLUSH_DELAYED_REFS_NR) + nr = calc_reclaim_items_nr(fs_info, num_bytes); + else + nr = 0; + btrfs_run_delayed_refs(trans, nr); + btrfs_end_transaction(trans); + break; + case ALLOC_CHUNK: + case ALLOC_CHUNK_FORCE: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_chunk_alloc(trans, + btrfs_metadata_alloc_profile(fs_info), + (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : + CHUNK_ALLOC_FORCE); + btrfs_end_transaction(trans); + if (ret > 0 || ret == -ENOSPC) + ret = 0; + break; + case RUN_DELAYED_IPUTS: + /* + * If we have pending delayed iputs then we could free up a + * bunch of pinned space, so make sure we run the iputs before + * we do our pinned bytes check below. + */ + btrfs_run_delayed_iputs(fs_info); + btrfs_wait_on_delayed_iputs(fs_info); + break; + case COMMIT_TRANS: + ret = may_commit_transaction(fs_info, space_info); + break; + default: + ret = -ENOSPC; + break; + } + + trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, + ret); + return; +} + +static inline u64 +btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool system_chunk) +{ + struct reserve_ticket *ticket; + u64 used; + u64 expected; + u64 to_reclaim = 0; + + list_for_each_entry(ticket, &space_info->tickets, list) + to_reclaim += ticket->bytes; + list_for_each_entry(ticket, &space_info->priority_tickets, list) + to_reclaim += ticket->bytes; + if (to_reclaim) + return to_reclaim; + + to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); + if (can_overcommit(fs_info, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + return 0; + + used = btrfs_space_info_used(space_info, true); + + if (can_overcommit(fs_info, space_info, SZ_1M, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + expected = div_factor_fine(space_info->total_bytes, 95); + else + expected = div_factor_fine(space_info->total_bytes, 90); + + if (used > expected) + to_reclaim = used - expected; + else + to_reclaim = 0; + to_reclaim = min(to_reclaim, space_info->bytes_may_use + + space_info->bytes_reserved); + return to_reclaim; +} + +static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 used, bool system_chunk) +{ + u64 thresh = div_factor_fine(space_info->total_bytes, 98); + + /* If we're just plain full then async reclaim just slows us down. */ + if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) + return 0; + + if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, + system_chunk)) + return 0; + + return (used >= thresh && !btrfs_fs_closing(fs_info) && + !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); +} + +/* + * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets + * @fs_info - fs_info for this fs + * @space_info - the space info we were flushing + * + * We call this when we've exhausted our flushing ability and haven't made + * progress in satisfying tickets. The reservation code handles tickets in + * order, so if there is a large ticket first and then smaller ones we could + * very well satisfy the smaller tickets. This will attempt to wake up any + * tickets in the list to catch this case. + * + * This function returns true if it was able to make progress by clearing out + * other tickets, or if it stumbles across a ticket that was smaller than the + * first ticket. + */ +static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + struct reserve_ticket *ticket; + u64 tickets_id = space_info->tickets_id; + u64 first_ticket_bytes = 0; + + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + btrfs_info(fs_info, "cannot satisfy tickets, dumping space info"); + __btrfs_dump_space_info(fs_info, space_info); + } + + while (!list_empty(&space_info->tickets) && + tickets_id == space_info->tickets_id) { + ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + + /* + * may_commit_transaction will avoid committing the transaction + * if it doesn't feel like the space reclaimed by the commit + * would result in the ticket succeeding. However if we have a + * smaller ticket in the queue it may be small enough to be + * satisified by committing the transaction, so if any + * subsequent ticket is smaller than the first ticket go ahead + * and send us back for another loop through the enospc flushing + * code. + */ + if (first_ticket_bytes == 0) + first_ticket_bytes = ticket->bytes; + else if (first_ticket_bytes > ticket->bytes) + return true; + + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_info(fs_info, "failing ticket with %llu bytes", + ticket->bytes); + + list_del_init(&ticket->list); + ticket->error = -ENOSPC; + wake_up(&ticket->wait); + + /* + * We're just throwing tickets away, so more flushing may not + * trip over btrfs_try_granting_tickets, so we need to call it + * here to see if we can make progress with the next ticket in + * the list. + */ + btrfs_try_granting_tickets(fs_info, space_info); + } + return (tickets_id != space_info->tickets_id); +} + +/* + * This is for normal flushers, we can wait all goddamned day if we want to. We + * will loop and continuously try to flush as long as we are making progress. + * We count progress as clearing off tickets each time we have to loop. + */ +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 to_reclaim; + int flush_state; + int commit_cycles = 0; + u64 last_tickets_id; + + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + + spin_lock(&space_info->lock); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); + if (!to_reclaim) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + + flush_state = FLUSH_DELAYED_ITEMS_NR; + do { + flush_space(fs_info, space_info, to_reclaim, flush_state); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, + space_info, + false); + if (last_tickets_id == space_info->tickets_id) { + flush_state++; + } else { + last_tickets_id = space_info->tickets_id; + flush_state = FLUSH_DELAYED_ITEMS_NR; + if (commit_cycles) + commit_cycles--; + } + + /* + * We don't want to force a chunk allocation until we've tried + * pretty hard to reclaim space. Think of the case where we + * freed up a bunch of space and so have a lot of pinned space + * to reclaim. We would rather use that than possibly create a + * underutilized metadata chunk. So if this is our first run + * through the flushing state machine skip ALLOC_CHUNK_FORCE and + * commit the transaction. If nothing has changed the next go + * around then we can force a chunk allocation. + */ + if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) + flush_state++; + + if (flush_state > COMMIT_TRANS) { + commit_cycles++; + if (commit_cycles > 2) { + if (maybe_fail_all_tickets(fs_info, space_info)) { + flush_state = FLUSH_DELAYED_ITEMS_NR; + commit_cycles--; + } else { + space_info->flush = 0; + } + } else { + flush_state = FLUSH_DELAYED_ITEMS_NR; + } + } + spin_unlock(&space_info->lock); + } while (flush_state <= COMMIT_TRANS); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ + INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} + +static const enum btrfs_flush_state priority_flush_states[] = { + FLUSH_DELAYED_ITEMS_NR, + FLUSH_DELAYED_ITEMS, + ALLOC_CHUNK, +}; + +static const enum btrfs_flush_state evict_flush_states[] = { + FLUSH_DELAYED_ITEMS_NR, + FLUSH_DELAYED_ITEMS, + FLUSH_DELAYED_REFS_NR, + FLUSH_DELAYED_REFS, + FLUSH_DELALLOC, + FLUSH_DELALLOC_WAIT, + ALLOC_CHUNK, + COMMIT_TRANS, +}; + +static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket, + const enum btrfs_flush_state *states, + int states_nr) +{ + u64 to_reclaim; + int flush_state; + + spin_lock(&space_info->lock); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); + if (!to_reclaim) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + + flush_state = 0; + do { + flush_space(fs_info, space_info, to_reclaim, states[flush_state]); + flush_state++; + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + } while (flush_state < states_nr); +} + +static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) + +{ + DEFINE_WAIT(wait); + int ret = 0; + + spin_lock(&space_info->lock); + while (ticket->bytes > 0 && ticket->error == 0) { + ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); + if (ret) { + /* + * Delete us from the list. After we unlock the space + * info, we don't want the async reclaim job to reserve + * space for this ticket. If that would happen, then the + * ticket's task would not known that space was reserved + * despite getting an error, resulting in a space leak + * (bytes_may_use counter of our space_info). + */ + list_del_init(&ticket->list); + ticket->error = -EINTR; + break; + } + spin_unlock(&space_info->lock); + + schedule(); + + finish_wait(&ticket->wait, &wait); + spin_lock(&space_info->lock); + } + spin_unlock(&space_info->lock); +} + +/** + * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket + * @fs_info - the fs + * @space_info - the space_info for the reservation + * @ticket - the ticket for the reservation + * @flush - how much we can flush + * + * This does the work of figuring out how to flush for the ticket, waiting for + * the reservation, and returning the appropriate error if there is one. + */ +static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket, + enum btrfs_reserve_flush_enum flush) +{ + int ret; + + switch (flush) { + case BTRFS_RESERVE_FLUSH_ALL: + wait_reserve_ticket(fs_info, space_info, ticket); + break; + case BTRFS_RESERVE_FLUSH_LIMIT: + priority_reclaim_metadata_space(fs_info, space_info, ticket, + priority_flush_states, + ARRAY_SIZE(priority_flush_states)); + break; + case BTRFS_RESERVE_FLUSH_EVICT: + priority_reclaim_metadata_space(fs_info, space_info, ticket, + evict_flush_states, + ARRAY_SIZE(evict_flush_states)); + break; + default: + ASSERT(0); + break; + } + + spin_lock(&space_info->lock); + ret = ticket->error; + if (ticket->bytes || ticket->error) { + /* + * Need to delete here for priority tickets. For regular tickets + * either the async reclaim job deletes the ticket from the list + * or we delete it ourselves at wait_reserve_ticket(). + */ + list_del_init(&ticket->list); + if (!ret) + ret = -ENOSPC; + } + spin_unlock(&space_info->lock); + ASSERT(list_empty(&ticket->list)); + /* + * Check that we can't have an error set if the reservation succeeded, + * as that would confuse tasks and lead them to error out without + * releasing reserved space (if an error happens the expectation is that + * space wasn't reserved at all). + */ + ASSERT(!(ticket->bytes == 0 && ticket->error)); + return ret; +} + +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @space_info - the space info we want to allocate from + * @orig_bytes - the number of bytes we want + * @flush - whether or not we can flush to make our reservation + * + * This will reserve orig_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush, + bool system_chunk) +{ + struct reserve_ticket ticket; + u64 used; + int ret = 0; + bool pending_tickets; + + ASSERT(orig_bytes); + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); + + spin_lock(&space_info->lock); + ret = -ENOSPC; + used = btrfs_space_info_used(space_info, true); + pending_tickets = !list_empty(&space_info->tickets) || + !list_empty(&space_info->priority_tickets); + + /* + * Carry on if we have enough space (short-circuit) OR call + * can_overcommit() to ensure we can overcommit to continue. + */ + if (!pending_tickets && + ((used + orig_bytes <= space_info->total_bytes) || + can_overcommit(fs_info, space_info, orig_bytes, flush, + system_chunk))) { + btrfs_space_info_update_bytes_may_use(fs_info, space_info, + orig_bytes); + ret = 0; + } + + /* + * If we couldn't make a reservation then setup our reservation ticket + * and kick the async worker if it's not already running. + * + * If we are a priority flusher then we just need to add our ticket to + * the list and we will do our own flushing further down. + */ + if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { + ticket.bytes = orig_bytes; + ticket.error = 0; + init_waitqueue_head(&ticket.wait); + if (flush == BTRFS_RESERVE_FLUSH_ALL) { + list_add_tail(&ticket.list, &space_info->tickets); + if (!space_info->flush) { + space_info->flush = 1; + trace_btrfs_trigger_flush(fs_info, + space_info->flags, + orig_bytes, flush, + "enospc"); + queue_work(system_unbound_wq, + &fs_info->async_reclaim_work); + } + } else { + list_add_tail(&ticket.list, + &space_info->priority_tickets); + } + } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + used += orig_bytes; + /* + * We will do the space reservation dance during log replay, + * which means we won't have fs_info->fs_root set, so don't do + * the async reclaim as we will panic. + */ + if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && + need_do_async_reclaim(fs_info, space_info, + used, system_chunk) && + !work_busy(&fs_info->async_reclaim_work)) { + trace_btrfs_trigger_flush(fs_info, space_info->flags, + orig_bytes, flush, "preempt"); + queue_work(system_unbound_wq, + &fs_info->async_reclaim_work); + } + } + spin_unlock(&space_info->lock); + if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) + return ret; + + return handle_reserve_ticket(fs_info, space_info, &ticket, flush); +} + +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - whether or not we can flush to make our reservation + * + * This will reserve orig_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +int btrfs_reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + int ret; + bool system_chunk = (root == fs_info->chunk_root); + + ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, + orig_bytes, flush, system_chunk); + if (ret == -ENOSPC && + unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { + if (block_rsv != global_rsv && + !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes)) + ret = 0; + } + if (ret == -ENOSPC) { + trace_btrfs_space_reservation(fs_info, "space_info:enospc", + block_rsv->space_info->flags, + orig_bytes, 1); + + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_dump_space_info(fs_info, block_rsv->space_info, + orig_bytes, 0); + } + return ret; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h new file mode 100644 index 000000000000..1a349e3f9cc1 --- /dev/null +++ b/fs/btrfs/space-info.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SPACE_INFO_H +#define BTRFS_SPACE_INFO_H + +struct btrfs_space_info { + spinlock_t lock; + + u64 total_bytes; /* total bytes in the space, + this doesn't take mirrors into account */ + u64 bytes_used; /* total bytes used, + this doesn't take mirrors into account */ + u64 bytes_pinned; /* total bytes pinned, will be freed when the + transaction finishes */ + u64 bytes_reserved; /* total bytes the allocator has reserved for + current allocations */ + u64 bytes_may_use; /* number of bytes that may be used for + delalloc/allocations */ + u64 bytes_readonly; /* total bytes that are read only */ + + u64 max_extent_size; /* This will hold the maximum extent size of + the space info if we had an ENOSPC in the + allocator. */ + + unsigned int full:1; /* indicates that we cannot allocate any more + chunks for this space */ + unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + + unsigned int flush:1; /* set if we are trying to make space */ + + unsigned int force_alloc; /* set if we need to force a chunk + alloc for this space */ + + u64 disk_used; /* total bytes used on disk */ + u64 disk_total; /* total bytes on disk, takes mirrors into + account */ + + u64 flags; + + /* + * bytes_pinned is kept in line with what is actually pinned, as in + * we've called update_block_group and dropped the bytes_used counter + * and increased the bytes_pinned counter. However this means that + * bytes_pinned does not reflect the bytes that will be pinned once the + * delayed refs are flushed, so this counter is inc'ed every time we + * call btrfs_free_extent so it is a realtime count of what will be + * freed once the transaction is committed. It will be zeroed every + * time the transaction commits. + */ + struct percpu_counter total_bytes_pinned; + + struct list_head list; + /* Protected by the spinlock 'lock'. */ + struct list_head ro_bgs; + struct list_head priority_tickets; + struct list_head tickets; + /* + * tickets_id just indicates the next ticket will be handled, so note + * it's not stored per ticket. + */ + u64 tickets_id; + + struct rw_semaphore groups_sem; + /* for block groups in our same type */ + struct list_head block_groups[BTRFS_NR_RAID_TYPES]; + + struct kobject kobj; + struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; +}; + +struct reserve_ticket { + u64 bytes; + int error; + struct list_head list; + wait_queue_head_t wait; +}; + +static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) +{ + return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && + (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); +} + +/* + * + * Declare a helper function to detect underflow of various space info members + */ +#define DECLARE_SPACE_INFO_UPDATE(name, trace_name) \ +static inline void \ +btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ + struct btrfs_space_info *sinfo, \ + s64 bytes) \ +{ \ + const u64 abs_bytes = (bytes < 0) ? -bytes : bytes; \ + lockdep_assert_held(&sinfo->lock); \ + trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \ + trace_btrfs_space_reservation(fs_info, trace_name, \ + sinfo->flags, abs_bytes, \ + bytes > 0); \ + if (bytes < 0 && sinfo->name < -bytes) { \ + WARN_ON(1); \ + sinfo->name = 0; \ + return; \ + } \ + sinfo->name += bytes; \ +} + +DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info"); +DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info); +void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, + struct btrfs_space_info **space_info); +struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, + u64 flags); +u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, + bool may_use_included); +void btrfs_clear_space_info_full(struct btrfs_fs_info *info); +void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *info, u64 bytes, + int dump_block_groups); +int btrfs_reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush); +void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info); + +static inline void btrfs_space_info_free_bytes_may_use( + struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + spin_lock(&space_info->lock); + btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); +} + +#endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 4c13b737f568..73f7987143df 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -33,6 +33,8 @@ static inline void put_unaligned_le8(u8 val, void *p) * * The extent buffer api is used to do the page spanning work required to * have a metadata blocksize different from the page size. + * + * There are 2 variants defined, one with a token pointer and one without. */ #define DEFINE_BTRFS_SETGET_BITS(bits) \ @@ -50,8 +52,10 @@ u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \ int size = sizeof(u##bits); \ u##bits res; \ \ - if (token && token->kaddr && token->offset <= offset && \ - token->eb == eb && \ + ASSERT(token); \ + ASSERT(token->eb == eb); \ + \ + if (token->kaddr && token->offset <= offset && \ (token->offset + PAGE_SIZE >= offset + size)) { \ kaddr = token->kaddr; \ p = kaddr + part_offset - token->offset; \ @@ -68,11 +72,33 @@ u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \ } \ p = kaddr + part_offset - map_start; \ res = get_unaligned_le##bits(p + off); \ - if (token) { \ - token->kaddr = kaddr; \ - token->offset = map_start; \ - token->eb = eb; \ + token->kaddr = kaddr; \ + token->offset = map_start; \ + return res; \ +} \ +u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off) \ +{ \ + unsigned long part_offset = (unsigned long)ptr; \ + unsigned long offset = part_offset + off; \ + void *p; \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + int size = sizeof(u##bits); \ + u##bits res; \ + \ + err = map_private_extent_buffer(eb, offset, size, \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits leres; \ + \ + read_extent_buffer(eb, &leres, offset, size); \ + return le##bits##_to_cpu(leres); \ } \ + p = kaddr + part_offset - map_start; \ + res = get_unaligned_le##bits(p + off); \ return res; \ } \ void btrfs_set_token_##bits(struct extent_buffer *eb, \ @@ -89,8 +115,10 @@ void btrfs_set_token_##bits(struct extent_buffer *eb, \ unsigned long map_len; \ int size = sizeof(u##bits); \ \ - if (token && token->kaddr && token->offset <= offset && \ - token->eb == eb && \ + ASSERT(token); \ + ASSERT(token->eb == eb); \ + \ + if (token->kaddr && token->offset <= offset && \ (token->offset + PAGE_SIZE >= offset + size)) { \ kaddr = token->kaddr; \ p = kaddr + part_offset - token->offset; \ @@ -108,11 +136,32 @@ void btrfs_set_token_##bits(struct extent_buffer *eb, \ } \ p = kaddr + part_offset - map_start; \ put_unaligned_le##bits(val, p + off); \ - if (token) { \ - token->kaddr = kaddr; \ - token->offset = map_start; \ - token->eb = eb; \ + token->kaddr = kaddr; \ + token->offset = map_start; \ +} \ +void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val) \ +{ \ + unsigned long part_offset = (unsigned long)ptr; \ + unsigned long offset = part_offset + off; \ + void *p; \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + int size = sizeof(u##bits); \ + \ + err = map_private_extent_buffer(eb, offset, size, \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits val2; \ + \ + val2 = cpu_to_le##bits(val); \ + write_extent_buffer(eb, &val2, offset, size); \ + return; \ } \ + p = kaddr + part_offset - map_start; \ + put_unaligned_le##bits(val, p + off); \ } DEFINE_BTRFS_SETGET_BITS(8) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 120e4340792a..f452a94abdc3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -42,7 +42,10 @@ #include "dev-replace.h" #include "free-space-cache.h" #include "backref.h" +#include "space-info.h" +#include "sysfs.h" #include "tests/btrfs-tests.h" +#include "block-group.h" #include "qgroup.h" #define CREATE_TRACE_POINTS @@ -63,7 +66,7 @@ static struct file_system_type btrfs_root_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); -const char *btrfs_decode_error(int errno) +const char * __attribute_const__ btrfs_decode_error(int errno) { char *errstr = "unknown"; @@ -184,7 +187,7 @@ static struct ratelimit_state printk_limits[] = { RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), }; -void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; struct va_format vaf; @@ -1216,7 +1219,7 @@ static int btrfs_fill_super(struct super_block *sb, key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL); + inode = btrfs_iget(sb, &key, fs_info->fs_root); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto fail_close; @@ -1400,7 +1403,7 @@ static inline int is_subvolume_inode(struct inode *inode) } static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, - const char *device_name, struct vfsmount *mnt) + struct vfsmount *mnt) { struct dentry *root; int ret; @@ -1553,6 +1556,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); btrfs_sb(s)->bdev_holder = fs_type; + if (!strstr(crc32c_impl(), "generic")) + set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); error = btrfs_fill_super(s, fs_devices, data); } if (!error) @@ -1601,14 +1606,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, { struct vfsmount *mnt_root; struct dentry *root; - fmode_t mode = FMODE_READ; char *subvol_name = NULL; u64 subvol_objectid = 0; int error = 0; - if (!(flags & SB_RDONLY)) - mode |= FMODE_WRITE; - error = btrfs_parse_subvol_options(data, &subvol_name, &subvol_objectid); if (error) { @@ -1649,7 +1650,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } /* mount_subvol() will free subvol_name and mnt_root */ - root = mount_subvol(subvol_name, subvol_objectid, device_name, mnt_root); + root = mount_subvol(subvol_name, subvol_objectid, mnt_root); out: return root; @@ -1668,7 +1669,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_workqueue_set_max(fs_info->workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size); @@ -1900,12 +1900,12 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, struct btrfs_device_info *devices_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - u64 skip_space; u64 type; u64 avail_space; u64 min_stripe_size; - int min_stripes = 1, num_stripes = 1; + int num_stripes = 1; int i = 0, nr_devices; + const struct btrfs_raid_attr *rattr; /* * We aren't under the device list lock, so this is racy-ish, but good @@ -1929,21 +1929,21 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, /* calc min stripe number for data space allocation */ type = btrfs_data_alloc_profile(fs_info); - if (type & BTRFS_BLOCK_GROUP_RAID0) { - min_stripes = 2; + rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)]; + + if (type & BTRFS_BLOCK_GROUP_RAID0) num_stripes = nr_devices; - } else if (type & BTRFS_BLOCK_GROUP_RAID1) { - min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID1) num_stripes = 2; - } else if (type & BTRFS_BLOCK_GROUP_RAID10) { - min_stripes = 4; + else if (type & BTRFS_BLOCK_GROUP_RAID1C3) + num_stripes = 3; + else if (type & BTRFS_BLOCK_GROUP_RAID1C4) + num_stripes = 4; + else if (type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = 4; - } - if (type & BTRFS_BLOCK_GROUP_DUP) - min_stripe_size = 2 * BTRFS_STRIPE_LEN; - else - min_stripe_size = BTRFS_STRIPE_LEN; + /* Adjust for more than 1 stripe per device */ + min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN; rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { @@ -1959,28 +1959,21 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, avail_space = device->total_bytes - device->bytes_used; /* align with stripe_len */ - avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN); - avail_space *= BTRFS_STRIPE_LEN; + avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN); /* * In order to avoid overwriting the superblock on the drive, * btrfs starts at an offset of at least 1MB when doing chunk * allocation. + * + * This ensures we have at least min_stripe_size free space + * after excluding 1MB. */ - skip_space = SZ_1M; - - /* - * we can use the free space in [0, skip_space - 1], subtract - * it from the total. - */ - if (avail_space && avail_space >= skip_space) - avail_space -= skip_space; - else - avail_space = 0; - - if (avail_space < min_stripe_size) + if (avail_space <= SZ_1M + min_stripe_size) continue; + avail_space -= SZ_1M; + devices_info[i].dev = device; devices_info[i].max_avail = avail_space; @@ -1994,9 +1987,8 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, i = nr_devices - 1; avail_space = 0; - while (nr_devices >= min_stripes) { - if (num_stripes > nr_devices) - num_stripes = nr_devices; + while (nr_devices >= rattr->devs_min) { + num_stripes = min(num_stripes, nr_devices); if (devices_info[i].max_avail >= min_stripe_size) { int j; @@ -2033,7 +2025,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); struct btrfs_super_block *disk_super = fs_info->super_copy; - struct list_head *head = &fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; u64 total_free_data = 0; @@ -2047,7 +2038,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) int mixed = 0; rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { + list_for_each_entry_rcu(found, &fs_info->space_info, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { int i; @@ -2298,6 +2289,7 @@ static const struct super_operations btrfs_super_ops = { .show_devname = btrfs_show_devname, .alloc_inode = btrfs_alloc_inode, .destroy_inode = btrfs_destroy_inode, + .free_inode = btrfs_free_inode, .statfs = btrfs_statfs, .remount_fs = btrfs_remount, .freeze_fs = btrfs_freeze, @@ -2307,7 +2299,7 @@ static const struct super_operations btrfs_super_ops = { static const struct file_operations btrfs_ctl_fops = { .open = btrfs_control_open, .unlocked_ioctl = btrfs_control_ioctl, - .compat_ioctl = btrfs_control_ioctl, + .compat_ioctl = compat_ptr_ioctl, .owner = THIS_MODULE, .llseek = noop_llseek, }; @@ -2370,10 +2362,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_cachep; - err = extent_map_init(); + err = extent_state_cache_init(); if (err) goto free_extent_io; + err = extent_map_init(); + if (err) + goto free_extent_state_cache; + err = ordered_data_init(); if (err) goto free_extent_map; @@ -2432,6 +2428,8 @@ free_ordered_data: ordered_data_exit(); free_extent_map: extent_map_exit(); +free_extent_state_cache: + extent_state_cache_exit(); free_extent_io: extent_io_exit(); free_cachep: @@ -2452,6 +2450,7 @@ static void __exit exit_btrfs_fs(void) btrfs_prelim_ref_exit(); ordered_data_exit(); extent_map_exit(); + extent_state_cache_exit(); extent_io_exit(); btrfs_interface_exit(); btrfs_end_io_wq_exit(); @@ -2465,3 +2464,7 @@ late_initcall(init_btrfs_fs); module_exit(exit_btrfs_fs) MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: crc32c"); +MODULE_SOFTDEP("pre: xxhash64"); +MODULE_SOFTDEP("pre: sha256"); +MODULE_SOFTDEP("pre: blake2b-256"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5a5930e3d32b..5ebbe8a5ee76 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -4,22 +4,88 @@ */ #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> -#include <linux/kobject.h> #include <linux/bug.h> -#include <linux/debugfs.h> +#include <crypto/hash.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "sysfs.h" #include "volumes.h" +#include "space-info.h" +#include "block-group.h" + +struct btrfs_feature_attr { + struct kobj_attribute kobj_attr; + enum btrfs_feature_set feature_set; + u64 feature_bit; +}; + +/* For raid type sysfs entries */ +struct raid_kobject { + u64 flags; + struct kobject kobj; +}; + +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = { .name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +} + +#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ + __INIT_KOBJ_ATTR(_name, 0644, _show, _store) + +#define BTRFS_ATTR(_prefix, _name, _show) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) + +#define BTRFS_ATTR_PTR(_prefix, _name) \ + (&btrfs_attr_##_prefix##_##_name.attr) + +#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \ +static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ + .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ + btrfs_feature_attr_show, \ + btrfs_feature_attr_store), \ + .feature_set = _feature_set, \ + .feature_bit = _feature_prefix ##_## _feature_bit, \ +} +#define BTRFS_FEAT_ATTR_PTR(_name) \ + (&btrfs_attr_features_##_name.kobj_attr.attr) + +#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) +#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature) +#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); +static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a) +{ + return container_of(a, struct btrfs_feature_attr, kobj_attr); +} + +static struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr) +{ + return container_of(attr, struct kobj_attribute, attr); +} + +static struct btrfs_feature_attr *attr_to_btrfs_feature_attr( + struct attribute *attr) +{ + return to_btrfs_feature_attr(attr_to_btrfs_attr(attr)); +} + static u64 get_features(struct btrfs_fs_info *fs_info, enum btrfs_feature_set set) { @@ -193,6 +259,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); +BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(mixed_backref), @@ -207,6 +274,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(no_holes), BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), + BTRFS_FEAT_ATTR_PTR(raid1c34), NULL }; @@ -230,8 +298,30 @@ static ssize_t rmdir_subvol_show(struct kobject *kobj, } BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show); +static ssize_t supported_checksums_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + ssize_t ret = 0; + int i; + + for (i = 0; i < btrfs_get_num_csums(); i++) { + /* + * This "trick" only works as long as 'enum btrfs_csum_type' has + * no holes in it + */ + ret += snprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + (i == 0 ? "" : " "), btrfs_super_csum_name(i)); + + } + + ret += snprintf(buf + ret, PAGE_SIZE - ret, "\n"); + return ret; +} +BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); + static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, rmdir_subvol), + BTRFS_ATTR_PTR(static_feature, supported_checksums), NULL }; @@ -246,6 +336,25 @@ static const struct attribute_group btrfs_static_feature_attr_group = { .attrs = btrfs_supported_static_feature_attrs, }; +#ifdef CONFIG_BTRFS_DEBUG + +/* + * Runtime debugging exported via sysfs + * + * /sys/fs/btrfs/debug - applies to module or all filesystems + * /sys/fs/btrfs/UUID - applies only to the given filesystem + */ +static struct attribute *btrfs_debug_feature_attrs[] = { + NULL +}; + +static const struct attribute_group btrfs_debug_feature_attr_group = { + .name = "debug", + .attrs = btrfs_debug_feature_attrs, +}; + +#endif + static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) { u64 val; @@ -288,36 +397,37 @@ static ssize_t raid_bytes_show(struct kobject *kobj, { struct btrfs_space_info *sinfo = to_space_info(kobj->parent); - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; int index = btrfs_bg_flags_to_raid_index(to_raid_kobj(kobj)->flags); u64 val = 0; down_read(&sinfo->groups_sem); list_for_each_entry(block_group, &sinfo->block_groups[index], list) { if (&attr->attr == BTRFS_ATTR_PTR(raid, total_bytes)) - val += block_group->key.offset; + val += block_group->length; else - val += btrfs_block_group_used(&block_group->item); + val += block_group->used; } up_read(&sinfo->groups_sem); return snprintf(buf, PAGE_SIZE, "%llu\n", val); } -static struct attribute *raid_attributes[] = { +static struct attribute *raid_attrs[] = { BTRFS_ATTR_PTR(raid, total_bytes), BTRFS_ATTR_PTR(raid, used_bytes), NULL }; +ATTRIBUTE_GROUPS(raid); static void release_raid_kobj(struct kobject *kobj) { kfree(to_raid_kobj(kobj)); } -struct kobj_type btrfs_raid_ktype = { +static struct kobj_type btrfs_raid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = release_raid_kobj, - .default_attrs = raid_attributes, + .default_groups = raid_groups, }; #define SPACE_INFO_ATTR(field) \ @@ -364,6 +474,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, total_bytes_pinned), NULL, }; +ATTRIBUTE_GROUPS(space_info); static void space_info_release(struct kobject *kobj) { @@ -372,10 +483,10 @@ static void space_info_release(struct kobject *kobj) kfree(sinfo); } -struct kobj_type space_info_ktype = { +static struct kobj_type space_info_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = space_info_release, - .default_attrs = space_info_attrs, + .default_groups = space_info_groups, }; static const struct attribute *allocation_attrs[] = { @@ -518,6 +629,19 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj, BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show); +static ssize_t btrfs_checksum_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + u16 csum_type = btrfs_super_csum_type(fs_info->super_copy); + + return snprintf(buf, PAGE_SIZE, "%s (%s)\n", + btrfs_super_csum_name(csum_type), + crypto_shash_driver_name(fs_info->csum_shash)); +} + +BTRFS_ATTR(, checksum, btrfs_checksum_show); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), @@ -525,6 +649,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, clone_alignment), BTRFS_ATTR_PTR(, quota_override), BTRFS_ATTR_PTR(, metadata_uuid), + BTRFS_ATTR_PTR(, checksum), NULL, }; @@ -652,12 +777,17 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL); } -const char * const btrfs_feature_set_names[FEAT_MAX] = { +static const char * const btrfs_feature_set_names[FEAT_MAX] = { [FEAT_COMPAT] = "compat", [FEAT_COMPAT_RO] = "compat_ro", [FEAT_INCOMPAT] = "incompat", }; +const char * const btrfs_feature_set_name(enum btrfs_feature_set set) +{ + return btrfs_feature_set_names[set]; +} + char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags) { size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */ @@ -727,6 +857,110 @@ static void init_feature_attrs(void) } } +/* + * Create a sysfs entry for a given block group type at path + * /sys/fs/btrfs/UUID/allocation/data/TYPE + */ +void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_space_info *space_info = cache->space_info; + struct raid_kobject *rkobj; + const int index = btrfs_bg_flags_to_raid_index(cache->flags); + unsigned int nofs_flag; + int ret; + + /* + * Setup a NOFS context because kobject_add(), deep in its call chain, + * does GFP_KERNEL allocations, and we are often called in a context + * where if reclaim is triggered we can deadlock (we are either holding + * a transaction handle or some lock required for a transaction + * commit). + */ + nofs_flag = memalloc_nofs_save(); + + rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); + if (!rkobj) { + memalloc_nofs_restore(nofs_flag); + btrfs_warn(cache->fs_info, + "couldn't alloc memory for raid level kobject"); + return; + } + + rkobj->flags = cache->flags; + kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", + btrfs_bg_type_to_raid_name(rkobj->flags)); + memalloc_nofs_restore(nofs_flag); + if (ret) { + kobject_put(&rkobj->kobj); + btrfs_warn(fs_info, + "failed to add kobject for block cache, ignoring"); + return; + } + + space_info->block_group_kobjs[index] = &rkobj->kobj; +} + +/* + * Remove sysfs directories for all block group types of a given space info and + * the space info as well + */ +void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info) +{ + int i; + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + struct kobject *kobj; + + kobj = space_info->block_group_kobjs[i]; + space_info->block_group_kobjs[i] = NULL; + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } + } + kobject_del(&space_info->kobj); + kobject_put(&space_info->kobj); +} + +static const char *alloc_name(u64 flags) +{ + switch (flags) { + case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: + return "mixed"; + case BTRFS_BLOCK_GROUP_METADATA: + return "metadata"; + case BTRFS_BLOCK_GROUP_DATA: + return "data"; + case BTRFS_BLOCK_GROUP_SYSTEM: + return "system"; + default: + WARN_ON(1); + return "invalid-combination"; + }; +} + +/* + * Create a sysfs entry for a space info type at path + * /sys/fs/btrfs/UUID/allocation/TYPE + */ +int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + int ret; + + ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, + fs_info->space_info_kobj, "%s", + alloc_name(space_info->flags)); + if (ret) { + kobject_put(&space_info->kobj); + return ret; + } + + return 0; +} + /* when one_device is NULL, it removes all device links */ int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, @@ -803,14 +1037,34 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, return error; } -/* /sys/fs/btrfs/ entry */ -static struct kset *btrfs_kset; +void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) +{ + int ret; + + ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); + if (ret) + pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", + action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), + &disk_to_dev(bdev->bd_disk)->kobj); +} -/* /sys/kernel/debug/btrfs */ -static struct dentry *btrfs_debugfs_root_dentry; +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, + const u8 *fsid) +{ + char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; + + /* + * Sprouting changes fsid of the mounted filesystem, rename the fsid + * directory + */ + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid); + if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) + btrfs_warn(fs_devices->fs_info, + "sysfs: failed to create fsid for sprout"); +} -/* Debugging tunables and exported data */ -u64 btrfs_debugfs_test; +/* /sys/fs/btrfs/ entry */ +static struct kset *btrfs_kset; /* * Can be called by the device discovery thread. @@ -825,7 +1079,12 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, fs_devs->fsid_kobj.kset = btrfs_kset; error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, parent, "%pU", fs_devs->fsid); - return error; + if (error) { + kobject_put(&fs_devs->fsid_kobj); + return error; + } + + return 0; } int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) @@ -851,6 +1110,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) if (error) goto failure; +#ifdef CONFIG_BTRFS_DEBUG + error = sysfs_create_group(fsid_kobj, + &btrfs_debug_feature_attr_group); + if (error) + goto failure; +#endif + error = addrm_unknown_feature_attrs(fs_info, true); if (error) goto failure; @@ -905,28 +1171,6 @@ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); } -static int btrfs_init_debugfs(void) -{ -#ifdef CONFIG_DEBUG_FS - btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL); - if (!btrfs_debugfs_root_dentry) - return -ENOMEM; - - /* - * Example code, how to export data through debugfs. - * - * file: /sys/kernel/debug/btrfs/test - * contents of: btrfs_debugfs_test - */ -#ifdef CONFIG_BTRFS_DEBUG - debugfs_create_u64("test", S_IRUGO | S_IWUSR, btrfs_debugfs_root_dentry, - &btrfs_debugfs_test); -#endif - -#endif - return 0; -} - int __init btrfs_init_sysfs(void) { int ret; @@ -935,10 +1179,6 @@ int __init btrfs_init_sysfs(void) if (!btrfs_kset) return -ENOMEM; - ret = btrfs_init_debugfs(); - if (ret) - goto out1; - init_feature_attrs(); ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); if (ret) @@ -948,13 +1188,17 @@ int __init btrfs_init_sysfs(void) if (ret) goto out_remove_group; +#ifdef CONFIG_BTRFS_DEBUG + ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); + if (ret) + goto out2; +#endif + return 0; out_remove_group: sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); out2: - debugfs_remove_recursive(btrfs_debugfs_root_dentry); -out1: kset_unregister(btrfs_kset); return ret; @@ -966,6 +1210,5 @@ void __cold btrfs_exit_sysfs(void) &btrfs_static_feature_attr_group); sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); kset_unregister(btrfs_kset); - debugfs_remove_recursive(btrfs_debugfs_root_dentry); } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 40716b357c1d..e10c3adfc30f 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -3,10 +3,7 @@ #ifndef BTRFS_SYSFS_H #define BTRFS_SYSFS_H -/* - * Data exported through sysfs - */ -extern u64 btrfs_debugfs_test; +#include <linux/kobject.h> enum btrfs_feature_set { FEAT_COMPAT, @@ -15,71 +12,8 @@ enum btrfs_feature_set { FEAT_MAX }; -#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ -{ \ - .attr = { .name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ -} - -#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ - static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ - __INIT_KOBJ_ATTR(_name, 0644, _show, _store) - -#define BTRFS_ATTR(_prefix, _name, _show) \ - static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ - __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) - -#define BTRFS_ATTR_PTR(_prefix, _name) \ - (&btrfs_attr_##_prefix##_##_name.attr) - - -struct btrfs_feature_attr { - struct kobj_attribute kobj_attr; - enum btrfs_feature_set feature_set; - u64 feature_bit; -}; - -#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \ -static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ - .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ - btrfs_feature_attr_show, \ - btrfs_feature_attr_store), \ - .feature_set = _feature_set, \ - .feature_bit = _feature_prefix ##_## _feature_bit, \ -} -#define BTRFS_FEAT_ATTR_PTR(_name) \ - (&btrfs_attr_features_##_name.kobj_attr.attr) - -#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ - BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) -#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ - BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature) -#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ - BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) - -/* convert from attribute */ -static inline struct btrfs_feature_attr * -to_btrfs_feature_attr(struct kobj_attribute *a) -{ - return container_of(a, struct btrfs_feature_attr, kobj_attr); -} - -static inline struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr) -{ - return container_of(attr, struct kobj_attribute, attr); -} - -static inline struct btrfs_feature_attr * -attr_to_btrfs_feature_attr(struct attribute *attr) -{ - return to_btrfs_feature_attr(attr_to_btrfs_attr(attr)); -} - char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); -extern const char * const btrfs_feature_set_names[FEAT_MAX]; -extern struct kobj_type space_info_ktype; -extern struct kobj_type btrfs_raid_ktype; +const char * const btrfs_feature_set_name(enum btrfs_feature_set set); int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, @@ -88,7 +22,19 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, struct kobject *parent); int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, + const u8 *fsid); void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, u64 bit, enum btrfs_feature_set set); +void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); + +int __init btrfs_init_sysfs(void); +void __cold btrfs_exit_sysfs(void); +int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache); +int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info); +void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info); #endif diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 8a59597f1883..a7aca4141788 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -5,6 +5,7 @@ #include <linux/fs.h> #include <linux/mount.h> +#include <linux/pseudo_fs.h> #include <linux/magic.h> #include "btrfs-tests.h" #include "../ctree.h" @@ -14,31 +15,50 @@ #include "../volumes.h" #include "../disk-io.h" #include "../qgroup.h" +#include "../block-group.h" static struct vfsmount *test_mnt = NULL; +const char *test_error[] = { + [TEST_ALLOC_FS_INFO] = "cannot allocate fs_info", + [TEST_ALLOC_ROOT] = "cannot allocate root", + [TEST_ALLOC_EXTENT_BUFFER] = "cannot extent buffer", + [TEST_ALLOC_PATH] = "cannot allocate path", + [TEST_ALLOC_INODE] = "cannot allocate inode", + [TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group", + [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", +}; + static const struct super_operations btrfs_test_super_ops = { .alloc_inode = btrfs_alloc_inode, .destroy_inode = btrfs_test_destroy_inode, }; -static struct dentry *btrfs_test_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) + +static int btrfs_test_init_fs_context(struct fs_context *fc) { - return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops, - NULL, BTRFS_TEST_MAGIC); + struct pseudo_fs_context *ctx = init_pseudo(fc, BTRFS_TEST_MAGIC); + if (!ctx) + return -ENOMEM; + ctx->ops = &btrfs_test_super_ops; + return 0; } static struct file_system_type test_type = { .name = "btrfs_test_fs", - .mount = btrfs_test_mount, + .init_fs_context = btrfs_test_init_fs_context, .kill_sb = kill_anon_super, }; struct inode *btrfs_new_test_inode(void) { - return new_inode(test_mnt->mnt_sb); + struct inode *inode; + + inode = new_inode(test_mnt->mnt_sb); + if (inode) + inode_init_owner(inode, NULL, S_IFREG); + + return inode; } static int btrfs_init_test_fs(void) @@ -99,7 +119,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->qgroup_lock); - spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); @@ -115,8 +134,10 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); - extent_io_tree_init(&fs_info->freed_extents[0], NULL); - extent_io_tree_init(&fs_info->freed_extents[1], NULL); + extent_io_tree_init(fs_info, &fs_info->freed_extents[0], + IO_TREE_FS_INFO_FREED_EXTENTS0, NULL); + extent_io_tree_init(fs_info, &fs_info->freed_extents[1], + IO_TREE_FS_INFO_FREED_EXTENTS1, NULL); fs_info->pinned_extents = &fs_info->freed_extents[0]; set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); @@ -181,11 +202,11 @@ void btrfs_free_dummy_root(struct btrfs_root *root) kfree(root); } -struct btrfs_block_group_cache * +struct btrfs_block_group * btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; cache = kzalloc(sizeof(*cache), GFP_KERNEL); if (!cache) @@ -197,9 +218,8 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, return NULL; } - cache->key.objectid = 0; - cache->key.offset = length; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->start = 0; + cache->length = length; cache->full_stripe_len = fs_info->sectorsize; cache->fs_info = fs_info; @@ -212,7 +232,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, return cache; } -void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache) +void btrfs_free_dummy_block_group(struct btrfs_block_group *cache) { if (!cache) return; diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 70ff9f9d86a1..9e52527357d8 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -10,7 +10,22 @@ int btrfs_run_sanity_tests(void); #define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__) -#define test_err(fmt, ...) pr_err("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__) +#define test_err(fmt, ...) pr_err("BTRFS: selftest: %s:%d " fmt "\n", \ + __FILE__, __LINE__, ##__VA_ARGS__) + +#define test_std_err(index) test_err("%s", test_error[index]) + +enum { + TEST_ALLOC_FS_INFO, + TEST_ALLOC_ROOT, + TEST_ALLOC_EXTENT_BUFFER, + TEST_ALLOC_PATH, + TEST_ALLOC_INODE, + TEST_ALLOC_BLOCK_GROUP, + TEST_ALLOC_EXTENT_MAP, +}; + +extern const char *test_error[]; struct btrfs_root; struct btrfs_trans_handle; @@ -26,9 +41,9 @@ struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); void btrfs_free_dummy_root(struct btrfs_root *root); -struct btrfs_block_group_cache * +struct btrfs_block_group * btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length); -void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache); +void btrfs_free_dummy_block_group(struct btrfs_block_group *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); #else diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 7d72eab6d32c..a1b9f9b5978e 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -30,27 +30,27 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_err("could not allocate fs_info"); + test_std_err(TEST_ALLOC_FS_INFO); return -ENOMEM; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_err("could not allocate root"); + test_std_err(TEST_ALLOC_ROOT); ret = PTR_ERR(root); goto out; } path = btrfs_alloc_path(); if (!path) { - test_err("could not allocate path"); + test_std_err(TEST_ALLOC_PATH); ret = -ENOMEM; goto out; } path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize); if (!eb) { - test_err("could not allocate dummy buffer"); + test_std_err(TEST_ALLOC_EXTENT_BUFFER); ret = -ENOMEM; goto out; } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 3c46d7f23456..123d9a614357 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -10,6 +10,7 @@ #include "btrfs-tests.h" #include "../ctree.h" #include "../extent_io.h" +#include "../btrfs_inode.h" #define PROCESS_UNLOCK (1 << 0) #define PROCESS_RELEASE (1 << 1) @@ -58,7 +59,7 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, static int test_find_delalloc(u32 sectorsize) { struct inode *inode; - struct extent_io_tree tmp; + struct extent_io_tree *tmp; struct page *page; struct page *locked_page = NULL; unsigned long index = 0; @@ -73,11 +74,16 @@ static int test_find_delalloc(u32 sectorsize) inode = btrfs_new_test_inode(); if (!inode) { - test_err("failed to allocate test inode"); + test_std_err(TEST_ALLOC_INODE); return -ENOMEM; } + tmp = &BTRFS_I(inode)->io_tree; - extent_io_tree_init(&tmp, NULL); + /* + * Passing NULL as we don't have fs_info but tracepoints are not used + * at this point + */ + extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL); /* * First go through and create and mark all of our pages dirty, we pin @@ -104,10 +110,10 @@ static int test_find_delalloc(u32 sectorsize) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); + set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL); start = 0; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { test_err("should have found at least one delalloc"); @@ -118,7 +124,7 @@ static int test_find_delalloc(u32 sectorsize) sectorsize - 1, start, end); goto out_bits; } - unlock_extent(&tmp, start, end); + unlock_extent(tmp, start, end); unlock_page(locked_page); put_page(locked_page); @@ -135,10 +141,10 @@ static int test_find_delalloc(u32 sectorsize) test_err("couldn't find the locked page"); goto out_bits; } - set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); + set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { test_err("couldn't find delalloc in our range"); @@ -154,7 +160,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("there were unlocked pages in the range"); goto out_bits; } - unlock_extent(&tmp, start, end); + unlock_extent(tmp, start, end); /* locked_page was unlocked above */ put_page(locked_page); @@ -172,7 +178,7 @@ static int test_find_delalloc(u32 sectorsize) } start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (found) { test_err("found range when we shouldn't have"); @@ -190,10 +196,10 @@ static int test_find_delalloc(u32 sectorsize) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); + set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { test_err("didn't find our range"); @@ -209,7 +215,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("pages in range were not all locked"); goto out_bits; } - unlock_extent(&tmp, start, end); + unlock_extent(tmp, start, end); /* * Now to test where we run into a page that is no longer dirty in the @@ -234,7 +240,7 @@ static int test_find_delalloc(u32 sectorsize) * this changes at any point in the future we will need to fix this * tests expected behavior. */ - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { test_err("didn't find our range"); @@ -252,7 +258,7 @@ static int test_find_delalloc(u32 sectorsize) } ret = 0; out_bits: - clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1); + clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); out: if (locked_page) put_page(locked_page); @@ -374,8 +380,8 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) { struct btrfs_fs_info *fs_info; unsigned long len; - unsigned long *bitmap; - struct extent_buffer *eb; + unsigned long *bitmap = NULL; + struct extent_buffer *eb = NULL; int ret; test_msg("running extent buffer bitmap tests"); @@ -388,18 +394,23 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) ? sectorsize * 4 : sectorsize; fs_info = btrfs_alloc_dummy_fs_info(len, len); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } bitmap = kmalloc(len, GFP_KERNEL); if (!bitmap) { test_err("couldn't allocate test bitmap"); - return -ENOMEM; + ret = -ENOMEM; + goto out; } eb = __alloc_dummy_extent_buffer(fs_info, 0, len); if (!eb) { - test_err("couldn't allocate test extent buffer"); - kfree(bitmap); - return -ENOMEM; + test_std_err(TEST_ALLOC_ROOT); + ret = -ENOMEM; + goto out; } ret = __test_eb_bitmaps(bitmap, eb, len); @@ -408,17 +419,118 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) /* Do it over again with an extent buffer which isn't page-aligned. */ free_extent_buffer(eb); - eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len); + eb = __alloc_dummy_extent_buffer(fs_info, nodesize / 2, len); if (!eb) { - test_err("couldn't allocate test extent buffer"); - kfree(bitmap); - return -ENOMEM; + test_std_err(TEST_ALLOC_ROOT); + ret = -ENOMEM; + goto out; } ret = __test_eb_bitmaps(bitmap, eb, len); out: free_extent_buffer(eb); kfree(bitmap); + btrfs_free_dummy_fs_info(fs_info); + return ret; +} + +static int test_find_first_clear_extent_bit(void) +{ + struct extent_io_tree tree; + u64 start, end; + int ret = -EINVAL; + + test_msg("running find_first_clear_extent_bit test"); + extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); + + /* + * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between + * 4M-32M + */ + set_extent_bits(&tree, SZ_1M, SZ_4M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + if (start != 0 || end != SZ_1M - 1) { + test_err("error finding beginning range: start %llu end %llu", + start, end); + goto out; + } + + /* Now add 32M-64M so that we have a hole between 4M-32M */ + set_extent_bits(&tree, SZ_32M, SZ_64M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + /* + * Request first hole starting at 12M, we should get 4M-32M + */ + find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + if (start != SZ_4M || end != SZ_32M - 1) { + test_err("error finding trimmed range: start %llu end %llu", + start, end); + goto out; + } + + /* + * Search in the middle of allocated range, should get the next one + * available, which happens to be unallocated -> 4M-32M + */ + find_first_clear_extent_bit(&tree, SZ_2M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + if (start != SZ_4M || end != SZ_32M - 1) { + test_err("error finding next unalloc range: start %llu end %llu", + start, end); + goto out; + } + + /* + * Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag + * being unset in this range, we should get the entry in range 64M-72M + */ + set_extent_bits(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED); + find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, + CHUNK_TRIMMED); + + if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) { + test_err("error finding exact range: start %llu end %llu", + start, end); + goto out; + } + + find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end, + CHUNK_TRIMMED); + + /* + * Search in the middle of set range whose immediate neighbour doesn't + * have the bits set so it must be returned + */ + if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) { + test_err("error finding next alloc range: start %llu end %llu", + start, end); + goto out; + } + + /* + * Search beyond any known range, shall return after last known range + * and end should be -1 + */ + find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED); + if (start != SZ_64M + SZ_8M || end != -1) { + test_err( + "error handling beyond end of range search: start %llu end %llu", + start, end); + goto out; + } + + ret = 0; +out: + clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED); + return ret; } @@ -432,8 +544,11 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) if (ret) goto out; + ret = test_find_first_clear_extent_bit(); + if (ret) + goto out; + ret = test_eb_bitmaps(sectorsize, nodesize); out: - test_msg("extent I/O tests finished"); return ret; } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index bf15d3a7f20e..4a7f796c9900 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -47,7 +47,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) * ->add_extent_mapping(0, 16K) * -> #handle -EEXIST */ -static void test_case_1(struct btrfs_fs_info *fs_info, +static int test_case_1(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree) { struct extent_map *em; @@ -56,55 +56,79 @@ static void test_case_1(struct btrfs_fs_info *fs_info, int ret; em = alloc_extent_map(); - if (!em) - /* Skip the test on error. */ - return; + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } /* Add [0, 16K) */ em->start = 0; em->len = SZ_16K; em->block_start = 0; em->block_len = SZ_16K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); - ASSERT(ret == 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("cannot add extent range [0, 16K)"); + goto out; + } free_extent_map(em); /* Add [16K, 20K) following [0, 16K) */ em = alloc_extent_map(); - if (!em) + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; goto out; + } em->start = SZ_16K; em->len = SZ_4K; em->block_start = SZ_32K; /* avoid merging */ em->block_len = SZ_4K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); - ASSERT(ret == 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("cannot add extent range [16K, 20K)"); + goto out; + } free_extent_map(em); em = alloc_extent_map(); - if (!em) + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; goto out; + } /* Add [0, 8K), should return [0, 16K) instead. */ em->start = start; em->len = len; em->block_start = start; em->block_len = len; + write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); - if (ret) + write_unlock(&em_tree->lock); + if (ret) { test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); + goto out; + } if (em && (em->start != 0 || extent_map_end(em) != SZ_16K || - em->block_start != 0 || em->block_len != SZ_16K)) + em->block_start != 0 || em->block_len != SZ_16K)) { test_err( "case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", start, start + len, ret, em->start, em->len, em->block_start, em->block_len); + ret = -EINVAL; + } free_extent_map(em); out: - /* free memory */ free_extent_map_tree(em_tree); + + return ret; } /* @@ -113,65 +137,89 @@ out: * Reading the inline ending up with EEXIST, ie. read an inline * extent and discard page cache and read it again. */ -static void test_case_2(struct btrfs_fs_info *fs_info, +static int test_case_2(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree) { struct extent_map *em; int ret; em = alloc_extent_map(); - if (!em) - /* Skip the test on error. */ - return; + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } /* Add [0, 1K) */ em->start = 0; em->len = SZ_1K; em->block_start = EXTENT_MAP_INLINE; em->block_len = (u64)-1; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); - ASSERT(ret == 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("cannot add extent range [0, 1K)"); + goto out; + } free_extent_map(em); - /* Add [4K, 4K) following [0, 1K) */ + /* Add [4K, 8K) following [0, 1K) */ em = alloc_extent_map(); - if (!em) + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; goto out; + } em->start = SZ_4K; em->len = SZ_4K; em->block_start = SZ_4K; em->block_len = SZ_4K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); - ASSERT(ret == 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("cannot add extent range [4K, 8K)"); + goto out; + } free_extent_map(em); em = alloc_extent_map(); - if (!em) + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; goto out; + } /* Add [0, 1K) */ em->start = 0; em->len = SZ_1K; em->block_start = EXTENT_MAP_INLINE; em->block_len = (u64)-1; + write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); - if (ret) + write_unlock(&em_tree->lock); + if (ret) { test_err("case2 [0 1K]: ret %d", ret); + goto out; + } if (em && (em->start != 0 || extent_map_end(em) != SZ_1K || - em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) + em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) { test_err( "case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", ret, em->start, em->len, em->block_start, em->block_len); + ret = -EINVAL; + } free_extent_map(em); out: - /* free memory */ free_extent_map_tree(em_tree); + + return ret; } -static void __test_case_3(struct btrfs_fs_info *fs_info, +static int __test_case_3(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree, u64 start) { struct extent_map *em; @@ -179,47 +227,63 @@ static void __test_case_3(struct btrfs_fs_info *fs_info, int ret; em = alloc_extent_map(); - if (!em) - /* Skip this test on error. */ - return; + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } /* Add [4K, 8K) */ em->start = SZ_4K; em->len = SZ_4K; em->block_start = SZ_4K; em->block_len = SZ_4K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); - ASSERT(ret == 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("cannot add extent range [4K, 8K)"); + goto out; + } free_extent_map(em); em = alloc_extent_map(); - if (!em) + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; goto out; + } /* Add [0, 16K) */ em->start = 0; em->len = SZ_16K; em->block_start = 0; em->block_len = SZ_16K; + write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); - if (ret) + write_unlock(&em_tree->lock); + if (ret) { test_err("case3 [0x%llx 0x%llx): ret %d", start, start + len, ret); + goto out; + } /* * Since bytes within em are contiguous, em->block_start is identical to * em->start. */ if (em && (start < em->start || start + len > extent_map_end(em) || - em->start != em->block_start || em->len != em->block_len)) + em->start != em->block_start || em->len != em->block_len)) { test_err( "case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", start, start + len, ret, em->start, em->len, em->block_start, em->block_len); + ret = -EINVAL; + } free_extent_map(em); out: - /* free memory */ free_extent_map_tree(em_tree); + + return ret; } /* @@ -238,15 +302,23 @@ out: * -> add_extent_mapping() * -> add_extent_mapping() */ -static void test_case_3(struct btrfs_fs_info *fs_info, +static int test_case_3(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree) { - __test_case_3(fs_info, em_tree, 0); - __test_case_3(fs_info, em_tree, SZ_8K); - __test_case_3(fs_info, em_tree, (12 * 1024ULL)); + int ret; + + ret = __test_case_3(fs_info, em_tree, 0); + if (ret) + return ret; + ret = __test_case_3(fs_info, em_tree, SZ_8K); + if (ret) + return ret; + ret = __test_case_3(fs_info, em_tree, (12 * SZ_1K)); + + return ret; } -static void __test_case_4(struct btrfs_fs_info *fs_info, +static int __test_case_4(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree, u64 start) { struct extent_map *em; @@ -254,54 +326,77 @@ static void __test_case_4(struct btrfs_fs_info *fs_info, int ret; em = alloc_extent_map(); - if (!em) - /* Skip this test on error. */ - return; + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } /* Add [0K, 8K) */ em->start = 0; em->len = SZ_8K; em->block_start = 0; em->block_len = SZ_8K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); - ASSERT(ret == 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("cannot add extent range [0, 8K)"); + goto out; + } free_extent_map(em); em = alloc_extent_map(); - if (!em) + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; goto out; + } - /* Add [8K, 24K) */ + /* Add [8K, 32K) */ em->start = SZ_8K; - em->len = 24 * 1024ULL; + em->len = 24 * SZ_1K; em->block_start = SZ_16K; /* avoid merging */ - em->block_len = 24 * 1024ULL; + em->block_len = 24 * SZ_1K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); - ASSERT(ret == 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("cannot add extent range [8K, 32K)"); + goto out; + } free_extent_map(em); em = alloc_extent_map(); - if (!em) + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; goto out; + } /* Add [0K, 32K) */ em->start = 0; em->len = SZ_32K; em->block_start = 0; em->block_len = SZ_32K; + write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); - if (ret) + write_unlock(&em_tree->lock); + if (ret) { test_err("case4 [0x%llx 0x%llx): ret %d", start, len, ret); - if (em && - (start < em->start || start + len > extent_map_end(em))) + goto out; + } + if (em && (start < em->start || start + len > extent_map_end(em))) { test_err( "case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", start, len, ret, em->start, em->len, em->block_start, em->block_len); + ret = -EINVAL; + } free_extent_map(em); out: - /* free memory */ free_extent_map_tree(em_tree); + + return ret; } /* @@ -329,17 +424,24 @@ out: * # handle -EEXIST when adding * # [0, 32K) */ -static void test_case_4(struct btrfs_fs_info *fs_info, +static int test_case_4(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree) { - __test_case_4(fs_info, em_tree, 0); - __test_case_4(fs_info, em_tree, SZ_4K); + int ret; + + ret = __test_case_4(fs_info, em_tree, 0); + if (ret) + return ret; + ret = __test_case_4(fs_info, em_tree, SZ_4K); + + return ret; } int btrfs_test_extent_map(void) { struct btrfs_fs_info *fs_info = NULL; struct extent_map_tree *em_tree; + int ret = 0; test_msg("running extent_map tests"); @@ -349,25 +451,32 @@ int btrfs_test_extent_map(void) */ fs_info = btrfs_alloc_dummy_fs_info(PAGE_SIZE, PAGE_SIZE); if (!fs_info) { - test_msg("Couldn't allocate dummy fs info"); + test_std_err(TEST_ALLOC_FS_INFO); return -ENOMEM; } em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL); - if (!em_tree) - /* Skip the test on error. */ + if (!em_tree) { + ret = -ENOMEM; goto out; + } extent_map_tree_init(em_tree); - test_case_1(fs_info, em_tree); - test_case_2(fs_info, em_tree); - test_case_3(fs_info, em_tree); - test_case_4(fs_info, em_tree); + ret = test_case_1(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_2(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_3(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_4(fs_info, em_tree); - kfree(em_tree); out: + kfree(em_tree); btrfs_free_dummy_fs_info(fs_info); - return 0; + return ret; } diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 5c2f77e9439b..aebdf23f0cdd 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -8,6 +8,7 @@ #include "../ctree.h" #include "../disk-io.h" #include "../free-space-cache.h" +#include "../block-group.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) @@ -16,7 +17,7 @@ * entry and remove space from either end and the middle, and make sure we can * remove space that covers adjacent extent entries. */ -static int test_extents(struct btrfs_block_group_cache *cache) +static int test_extents(struct btrfs_block_group *cache) { int ret = 0; @@ -86,8 +87,7 @@ static int test_extents(struct btrfs_block_group_cache *cache) return 0; } -static int test_bitmaps(struct btrfs_block_group_cache *cache, - u32 sectorsize) +static int test_bitmaps(struct btrfs_block_group *cache, u32 sectorsize) { u64 next_bitmap_offset; int ret; @@ -155,7 +155,7 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache, } /* This is the high grade jackassery */ -static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, +static int test_bitmaps_and_extents(struct btrfs_block_group *cache, u32 sectorsize) { u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize); @@ -330,7 +330,7 @@ static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl, /* Used by test_steal_space_from_bitmap_to_extent(). */ static int -check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache, +check_num_extents_and_bitmaps(const struct btrfs_block_group *cache, const int num_extents, const int num_bitmaps) { @@ -350,7 +350,7 @@ check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache, } /* Used by test_steal_space_from_bitmap_to_extent(). */ -static int check_cache_empty(struct btrfs_block_group_cache *cache) +static int check_cache_empty(struct btrfs_block_group *cache) { u64 offset; u64 max_extent_size; @@ -392,7 +392,7 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache) * requests. */ static int -test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, +test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache, u32 sectorsize) { int ret; @@ -404,7 +404,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, }; const struct btrfs_free_space_op *orig_free_space_ops; - test_msg("running space stealing from bitmap to extent"); + test_msg("running space stealing from bitmap to extent tests"); /* * For this test, we want to ensure we end up with an extent entry @@ -828,15 +828,16 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { struct btrfs_fs_info *fs_info; - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; struct btrfs_root *root = NULL; int ret = -ENOMEM; test_msg("running btrfs free space cache tests"); fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); - if (!fs_info) + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); return -ENOMEM; - + } /* * For ppc64 (with 64k page size), bytes per bitmap might be @@ -846,13 +847,14 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) cache = btrfs_alloc_dummy_block_group(fs_info, BITS_PER_BITMAP * sectorsize + PAGE_SIZE); if (!cache) { - test_err("couldn't run the tests"); + test_std_err(TEST_ALLOC_BLOCK_GROUP); btrfs_free_dummy_fs_info(fs_info); return 0; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); ret = PTR_ERR(root); goto out; } @@ -874,6 +876,5 @@ out: btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); - test_msg("free space cache tests finished"); return ret; } diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 89346da890cf..914eea5ba6a7 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -9,6 +9,7 @@ #include "../disk-io.h" #include "../free-space-tree.h" #include "../transaction.h" +#include "../block-group.h" struct free_space_extent { u64 start; @@ -17,7 +18,7 @@ struct free_space_extent { static int __check_free_space_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, const struct free_space_extent * const extents, unsigned int num_extents) @@ -30,7 +31,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, unsigned int i; int ret; - info = search_free_space_info(trans, fs_info, cache, path, 0); + info = search_free_space_info(trans, cache, path, 0); if (IS_ERR(info)) { test_err("could not find free space info"); ret = PTR_ERR(info); @@ -47,7 +48,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { if (path->slots[0] != 0) goto invalid; - end = cache->key.objectid + cache->key.offset; + end = cache->start + cache->length; i = 0; while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -106,7 +107,7 @@ invalid: static int check_free_space_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, const struct free_space_extent * const extents, unsigned int num_extents) @@ -115,7 +116,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, u32 flags; int ret; - info = search_free_space_info(trans, fs_info, cache, path, 0); + info = search_free_space_info(trans, cache, path, 0); if (IS_ERR(info)) { test_err("could not find free space info"); btrfs_release_path(path); @@ -149,12 +150,12 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, static int test_empty_block_group(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { const struct free_space_extent extents[] = { - {cache->key.objectid, cache->key.offset}, + {cache->start, cache->length}, }; return check_free_space_extents(trans, fs_info, cache, path, @@ -163,7 +164,7 @@ static int test_empty_block_group(struct btrfs_trans_handle *trans, static int test_remove_all(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { @@ -171,8 +172,8 @@ static int test_remove_all(struct btrfs_trans_handle *trans, int ret; ret = __remove_from_free_space_tree(trans, cache, path, - cache->key.objectid, - cache->key.offset); + cache->start, + cache->length); if (ret) { test_err("could not remove free space"); return ret; @@ -184,18 +185,17 @@ static int test_remove_all(struct btrfs_trans_handle *trans, static int test_remove_beginning(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { const struct free_space_extent extents[] = { - {cache->key.objectid + alignment, - cache->key.offset - alignment}, + {cache->start + alignment, cache->length - alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, cache, path, - cache->key.objectid, alignment); + cache->start, alignment); if (ret) { test_err("could not remove free space"); return ret; @@ -208,19 +208,18 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans, static int test_remove_end(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { const struct free_space_extent extents[] = { - {cache->key.objectid, cache->key.offset - alignment}, + {cache->start, cache->length - alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, cache, path, - cache->key.objectid + - cache->key.offset - alignment, - alignment); + cache->start + cache->length - alignment, + alignment); if (ret) { test_err("could not remove free space"); return ret; @@ -232,19 +231,18 @@ static int test_remove_end(struct btrfs_trans_handle *trans, static int test_remove_middle(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { const struct free_space_extent extents[] = { - {cache->key.objectid, alignment}, - {cache->key.objectid + 2 * alignment, - cache->key.offset - 2 * alignment}, + {cache->start, alignment}, + {cache->start + 2 * alignment, cache->length - 2 * alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, cache, path, - cache->key.objectid + alignment, + cache->start + alignment, alignment); if (ret) { test_err("could not remove free space"); @@ -257,24 +255,23 @@ static int test_remove_middle(struct btrfs_trans_handle *trans, static int test_merge_left(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { const struct free_space_extent extents[] = { - {cache->key.objectid, 2 * alignment}, + {cache->start, 2 * alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, cache, path, - cache->key.objectid, - cache->key.offset); + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid, + ret = __add_to_free_space_tree(trans, cache, path, cache->start, alignment); if (ret) { test_err("could not add free space"); @@ -282,7 +279,7 @@ static int test_merge_left(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, cache, path, - cache->key.objectid + alignment, + cache->start + alignment, alignment); if (ret) { test_err("could not add free space"); @@ -295,25 +292,24 @@ static int test_merge_left(struct btrfs_trans_handle *trans, static int test_merge_right(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { const struct free_space_extent extents[] = { - {cache->key.objectid + alignment, 2 * alignment}, + {cache->start + alignment, 2 * alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, cache, path, - cache->key.objectid, - cache->key.offset); + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } ret = __add_to_free_space_tree(trans, cache, path, - cache->key.objectid + 2 * alignment, + cache->start + 2 * alignment, alignment); if (ret) { test_err("could not add free space"); @@ -321,7 +317,7 @@ static int test_merge_right(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, cache, path, - cache->key.objectid + alignment, + cache->start + alignment, alignment); if (ret) { test_err("could not add free space"); @@ -334,24 +330,23 @@ static int test_merge_right(struct btrfs_trans_handle *trans, static int test_merge_both(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { const struct free_space_extent extents[] = { - {cache->key.objectid, 3 * alignment}, + {cache->start, 3 * alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, cache, path, - cache->key.objectid, - cache->key.offset); + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid, + ret = __add_to_free_space_tree(trans, cache, path, cache->start, alignment); if (ret) { test_err("could not add free space"); @@ -359,16 +354,14 @@ static int test_merge_both(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, cache, path, - cache->key.objectid + 2 * alignment, - alignment); + cache->start + 2 * alignment, alignment); if (ret) { test_err("could not add free space"); return ret; } ret = __add_to_free_space_tree(trans, cache, path, - cache->key.objectid + alignment, - alignment); + cache->start + alignment, alignment); if (ret) { test_err("could not add free space"); return ret; @@ -380,26 +373,25 @@ static int test_merge_both(struct btrfs_trans_handle *trans, static int test_merge_none(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { const struct free_space_extent extents[] = { - {cache->key.objectid, alignment}, - {cache->key.objectid + 2 * alignment, alignment}, - {cache->key.objectid + 4 * alignment, alignment}, + {cache->start, alignment}, + {cache->start + 2 * alignment, alignment}, + {cache->start + 4 * alignment, alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, cache, path, - cache->key.objectid, - cache->key.offset); + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid, + ret = __add_to_free_space_tree(trans, cache, path, cache->start, alignment); if (ret) { test_err("could not add free space"); @@ -407,16 +399,14 @@ static int test_merge_none(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, cache, path, - cache->key.objectid + 4 * alignment, - alignment); + cache->start + 4 * alignment, alignment); if (ret) { test_err("could not add free space"); return ret; } ret = __add_to_free_space_tree(trans, cache, path, - cache->key.objectid + 2 * alignment, - alignment); + cache->start + 2 * alignment, alignment); if (ret) { test_err("could not add free space"); return ret; @@ -428,7 +418,7 @@ static int test_merge_none(struct btrfs_trans_handle *trans, typedef int (*test_func_t)(struct btrfs_trans_handle *, struct btrfs_fs_info *, - struct btrfs_block_group_cache *, + struct btrfs_block_group *, struct btrfs_path *, u32 alignment); @@ -437,21 +427,21 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, { struct btrfs_fs_info *fs_info; struct btrfs_root *root = NULL; - struct btrfs_block_group_cache *cache = NULL; + struct btrfs_block_group *cache = NULL; struct btrfs_trans_handle trans; struct btrfs_path *path = NULL; int ret; fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_err("couldn't allocate dummy fs info"); + test_std_err(TEST_ALLOC_FS_INFO); ret = -ENOMEM; goto out; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_err("couldn't allocate dummy root"); + test_std_err(TEST_ALLOC_ROOT); ret = PTR_ERR(root); goto out; } @@ -462,9 +452,9 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, root->fs_info->tree_root = root; root->node = alloc_test_extent_buffer(root->fs_info, nodesize); - if (!root->node) { - test_err("couldn't allocate dummy buffer"); - ret = -ENOMEM; + if (IS_ERR(root->node)) { + test_std_err(TEST_ALLOC_EXTENT_BUFFER); + ret = PTR_ERR(root->node); goto out; } btrfs_set_header_level(root->node, 0); @@ -473,7 +463,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, cache = btrfs_alloc_dummy_block_group(fs_info, 8 * alignment); if (!cache) { - test_err("couldn't allocate dummy block group cache"); + test_std_err(TEST_ALLOC_BLOCK_GROUP); ret = -ENOMEM; goto out; } @@ -486,7 +476,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, path = btrfs_alloc_path(); if (!path) { - test_err("couldn't allocate path"); + test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; goto out; } @@ -539,7 +529,7 @@ static int run_test_both_formats(test_func_t test_func, u32 sectorsize, ret = run_test(test_func, 0, sectorsize, nodesize, alignment); if (ret) { test_err( - "%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u", + "%ps failed with extents, sectorsize=%u, nodesize=%u, alignment=%u", test_func, sectorsize, nodesize, alignment); test_ret = ret; } @@ -547,7 +537,7 @@ static int run_test_both_formats(test_func_t test_func, u32 sectorsize, ret = run_test(test_func, 1, sectorsize, nodesize, alignment); if (ret) { test_err( - "%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u", + "%ps failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u", test_func, sectorsize, nodesize, alignment); test_ret = ret; } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index af0c8e30d9e2..09ecf7dc7b08 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -226,31 +226,34 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) u64 offset; int ret = -ENOMEM; + test_msg("running btrfs_get_extent tests"); + inode = btrfs_new_test_inode(); if (!inode) { - test_err("couldn't allocate inode"); + test_std_err(TEST_ALLOC_INODE); return ret; } + inode->i_mode = S_IFREG; BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_err("couldn't allocate dummy fs info"); + test_std_err(TEST_ALLOC_FS_INFO); goto out; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_err("couldn't allocate root"); + test_std_err(TEST_ALLOC_ROOT); goto out; } root->node = alloc_dummy_extent_buffer(fs_info, nodesize); if (!root->node) { - test_err("couldn't allocate dummy buffer"); + test_std_err(TEST_ALLOC_ROOT); goto out; } @@ -827,9 +830,11 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) struct extent_map *em = NULL; int ret = -ENOMEM; + test_msg("running hole first btrfs_get_extent test"); + inode = btrfs_new_test_inode(); if (!inode) { - test_err("couldn't allocate inode"); + test_std_err(TEST_ALLOC_INODE); return ret; } @@ -839,19 +844,19 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_err("couldn't allocate dummy fs info"); + test_std_err(TEST_ALLOC_FS_INFO); goto out; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_err("couldn't allocate root"); + test_std_err(TEST_ALLOC_ROOT); goto out; } root->node = alloc_dummy_extent_buffer(fs_info, nodesize); if (!root->node) { - test_err("couldn't allocate dummy buffer"); + test_std_err(TEST_ALLOC_ROOT); goto out; } @@ -927,21 +932,23 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) struct btrfs_root *root = NULL; int ret = -ENOMEM; + test_msg("running outstanding_extents tests"); + inode = btrfs_new_test_inode(); if (!inode) { - test_err("couldn't allocate inode"); + test_std_err(TEST_ALLOC_INODE); return ret; } fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_err("couldn't allocate dummy fs info"); + test_std_err(TEST_ALLOC_FS_INFO); goto out; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_err("couldn't allocate root"); + test_std_err(TEST_ALLOC_ROOT); goto out; } @@ -950,7 +957,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* [BTRFS_MAX_EXTENT_SIZE] */ ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0, - NULL, 0); + NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -965,7 +972,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, - 0, NULL, 0); + 0, NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -981,8 +988,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - EXTENT_DELALLOC | EXTENT_DIRTY | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -998,7 +1004,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - 0, NULL, 0); + 0, NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -1016,7 +1022,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, - 0, NULL, 0); + 0, NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -1033,7 +1039,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -1049,8 +1055,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1068,7 +1073,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -1082,8 +1087,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* Empty */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1098,8 +1102,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) out: if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); @@ -1110,17 +1113,16 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize) { int ret; + test_msg("running inode tests"); + set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only); set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); - test_msg("running btrfs_get_extent tests"); ret = test_btrfs_get_extent(sectorsize, nodesize); if (ret) return ret; - test_msg("running hole first btrfs_get_extent test"); ret = test_hole_first(sectorsize, nodesize); if (ret) return ret; - test_msg("running outstanding_extents tests"); return test_extent_accounting(sectorsize, nodesize); } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 412b910b04cc..ac035a6fa003 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -32,7 +32,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, path = btrfs_alloc_path(); if (!path) { - test_err("couldn't allocate path"); + test_std_err(TEST_ALLOC_ROOT); return -ENOMEM; } @@ -82,7 +82,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, path = btrfs_alloc_path(); if (!path) { - test_err("couldn't allocate path"); + test_std_err(TEST_ALLOC_ROOT); return -ENOMEM; } @@ -132,7 +132,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, path = btrfs_alloc_path(); if (!path) { - test_err("couldn't allocate path"); + test_std_err(TEST_ALLOC_ROOT); return -ENOMEM; } path->leave_spinning = 1; @@ -166,7 +166,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, path = btrfs_alloc_path(); if (!path) { - test_err("couldn't allocate path"); + test_std_err(TEST_ALLOC_ROOT); return -ENOMEM; } @@ -215,7 +215,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, btrfs_init_dummy_trans(&trans, fs_info); - test_msg("qgroup basic add"); + test_msg("running qgroup add/remove tests"); ret = btrfs_create_qgroup(&trans, BTRFS_FS_TREE_OBJECTID); if (ret) { test_err("couldn't create a qgroup %d", ret); @@ -316,7 +316,7 @@ static int test_multiple_refs(struct btrfs_root *root, btrfs_init_dummy_trans(&trans, fs_info); - test_msg("qgroup multiple refs test"); + test_msg("running qgroup multiple refs test"); /* * We have BTRFS_FS_TREE_OBJECTID created already from the @@ -457,13 +457,13 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_err("couldn't allocate dummy fs info"); + test_std_err(TEST_ALLOC_FS_INFO); return -ENOMEM; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_err("couldn't allocate root"); + test_std_err(TEST_ALLOC_ROOT); ret = PTR_ERR(root); goto out; } @@ -484,9 +484,9 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) * *cough*backref walking code*cough* */ root->node = alloc_test_extent_buffer(root->fs_info, nodesize); - if (!root->node) { + if (IS_ERR(root->node)) { test_err("couldn't allocate dummy buffer"); - ret = -ENOMEM; + ret = PTR_ERR(root->node); goto out; } btrfs_set_header_level(root->node, 0); @@ -495,7 +495,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) tmp_root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(tmp_root)) { - test_err("couldn't allocate a fs root"); + test_std_err(TEST_ALLOC_ROOT); ret = PTR_ERR(tmp_root); goto out; } @@ -510,7 +510,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) tmp_root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(tmp_root)) { - test_err("couldn't allocate a fs root"); + test_std_err(TEST_ALLOC_ROOT); ret = PTR_ERR(tmp_root); goto out; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e4e665f422fc..cfc08ef9b876 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/blkdev.h> #include <linux/uuid.h> +#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -19,24 +20,98 @@ #include "volumes.h" #include "dev-replace.h" #include "qgroup.h" +#include "block-group.h" #define BTRFS_ROOT_TRANS_TAG 0 +/* + * Transaction states and transitions + * + * No running transaction (fs tree blocks are not modified) + * | + * | To next stage: + * | Call start_transaction() variants. Except btrfs_join_transaction_nostart(). + * V + * Transaction N [[TRANS_STATE_RUNNING]] + * | + * | New trans handles can be attached to transaction N by calling all + * | start_transaction() variants. + * | + * | To next stage: + * | Call btrfs_commit_transaction() on any trans handle attached to + * | transaction N + * V + * Transaction N [[TRANS_STATE_COMMIT_START]] + * | + * | Will wait for previous running transaction to completely finish if there + * | is one + * | + * | Then one of the following happes: + * | - Wait for all other trans handle holders to release. + * | The btrfs_commit_transaction() caller will do the commit work. + * | - Wait for current transaction to be committed by others. + * | Other btrfs_commit_transaction() caller will do the commit work. + * | + * | At this stage, only btrfs_join_transaction*() variants can attach + * | to this running transaction. + * | All other variants will wait for current one to finish and attach to + * | transaction N+1. + * | + * | To next stage: + * | Caller is chosen to commit transaction N, and all other trans handle + * | haven been released. + * V + * Transaction N [[TRANS_STATE_COMMIT_DOING]] + * | + * | The heavy lifting transaction work is started. + * | From running delayed refs (modifying extent tree) to creating pending + * | snapshots, running qgroups. + * | In short, modify supporting trees to reflect modifications of subvolume + * | trees. + * | + * | At this stage, all start_transaction() calls will wait for this + * | transaction to finish and attach to transaction N+1. + * | + * | To next stage: + * | Until all supporting trees are updated. + * V + * Transaction N [[TRANS_STATE_UNBLOCKED]] + * | Transaction N+1 + * | All needed trees are modified, thus we only [[TRANS_STATE_RUNNING]] + * | need to write them back to disk and update | + * | super blocks. | + * | | + * | At this stage, new transaction is allowed to | + * | start. | + * | All new start_transaction() calls will be | + * | attached to transid N+1. | + * | | + * | To next stage: | + * | Until all tree blocks are super blocks are | + * | written to block devices | + * V | + * Transaction N [[TRANS_STATE_COMPLETED]] V + * All tree blocks and super blocks are written. Transaction N+1 + * This transaction is finished and all its [[TRANS_STATE_COMMIT_START]] + * data structures will be cleaned up. | Life goes on + */ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, - [TRANS_STATE_BLOCKED] = __TRANS_START, [TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH), [TRANS_STATE_COMMIT_DOING] = (__TRANS_START | __TRANS_ATTACH | - __TRANS_JOIN), + __TRANS_JOIN | + __TRANS_JOIN_NOSTART), [TRANS_STATE_UNBLOCKED] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | - __TRANS_JOIN_NOLOCK), + __TRANS_JOIN_NOLOCK | + __TRANS_JOIN_NOSTART), [TRANS_STATE_COMPLETED] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | - __TRANS_JOIN_NOLOCK), + __TRANS_JOIN_NOLOCK | + __TRANS_JOIN_NOSTART), }; void btrfs_put_transaction(struct btrfs_transaction *transaction) @@ -50,14 +125,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) btrfs_err(transaction->fs_info, "pending csums is %llu", transaction->delayed_refs.pending_csums); - while (!list_empty(&transaction->pending_chunks)) { - struct extent_map *em; - - em = list_first_entry(&transaction->pending_chunks, - struct extent_map, list); - list_del_init(&em->list); - free_extent_map(em); - } /* * If any block groups are found in ->deleted_bgs then it's * because the transaction was aborted and a commit did not @@ -66,48 +133,20 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) * discard the physical locations of the block groups. */ while (!list_empty(&transaction->deleted_bgs)) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; cache = list_first_entry(&transaction->deleted_bgs, - struct btrfs_block_group_cache, + struct btrfs_block_group, bg_list); list_del_init(&cache->bg_list); btrfs_put_block_group_trimming(cache); btrfs_put_block_group(cache); } + WARN_ON(!list_empty(&transaction->dev_update_list)); kfree(transaction); } } -static void clear_btree_io_tree(struct extent_io_tree *tree) -{ - spin_lock(&tree->lock); - /* - * Do a single barrier for the waitqueue_active check here, the state - * of the waitqueue should not change once clear_btree_io_tree is - * called. - */ - smp_mb(); - while (!RB_EMPTY_ROOT(&tree->state)) { - struct rb_node *node; - struct extent_state *state; - - node = rb_first(&tree->state); - state = rb_entry(node, struct extent_state, rb_node); - rb_erase(&state->rb_node, &tree->state); - RB_CLEAR_NODE(&state->rb_node); - /* - * btree io trees aren't supposed to have tasks waiting for - * changes in the flags of extent states ever. - */ - ASSERT(!waitqueue_active(&state->wq)); - free_extent_state(state); - - cond_resched_lock(&tree->lock); - } - spin_unlock(&tree->lock); -} - static noinline void switch_commit_roots(struct btrfs_transaction *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -121,7 +160,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans) root->commit_root = btrfs_root_node(root); if (is_fstree(root->root_key.objectid)) btrfs_unpin_free_ino(root); - clear_btree_io_tree(&root->dirty_log_pages); + extent_io_tree_release(&root->dirty_log_pages); btrfs_qgroup_clean_swapped_blocks(root); } @@ -165,6 +204,24 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans) } /* + * To be called after all the new block groups attached to the transaction + * handle have been created (btrfs_create_pending_block_groups()). + */ +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + if (!trans->chunk_bytes_reserved) + return; + + WARN_ON_ONCE(!list_empty(&trans->new_bgs)); + + btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, + trans->chunk_bytes_reserved); + trans->chunk_bytes_reserved = 0; +} + +/* * either allocate a new transaction or hop into the existing one */ static noinline int join_transaction(struct btrfs_fs_info *fs_info, @@ -263,19 +320,18 @@ loop: spin_lock_init(&cur_trans->delayed_refs.lock); INIT_LIST_HEAD(&cur_trans->pending_snapshots); - INIT_LIST_HEAD(&cur_trans->pending_chunks); + INIT_LIST_HEAD(&cur_trans->dev_update_list); INIT_LIST_HEAD(&cur_trans->switch_commits); INIT_LIST_HEAD(&cur_trans->dirty_bgs); INIT_LIST_HEAD(&cur_trans->io_bgs); INIT_LIST_HEAD(&cur_trans->dropped_roots); mutex_init(&cur_trans->cache_write_mutex); - cur_trans->num_dirty_bgs = 0; spin_lock_init(&cur_trans->dirty_bgs_lock); INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->dropped_roots_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); - extent_io_tree_init(&cur_trans->dirty_pages, - fs_info->btree_inode); + extent_io_tree_init(fs_info, &cur_trans->dirty_pages, + IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); fs_info->generation++; cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; @@ -397,7 +453,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, static inline int is_transaction_blocked(struct btrfs_transaction *trans) { - return (trans->state >= TRANS_STATE_BLOCKED && + return (trans->state >= TRANS_STATE_COMMIT_START && trans->state < TRANS_STATE_UNBLOCKED && !trans->aborted); } @@ -500,7 +556,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * worth of delayed refs updates in this trans handle, and * refill that amount for whatever is missing in the reserve. */ - num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); if (delayed_refs_rsv->full == 0) { delayed_refs_bytes = num_bytes; num_bytes <<= 1; @@ -562,7 +618,8 @@ again: ret = join_transaction(fs_info, type); if (ret == -EBUSY) { wait_current_trans(fs_info); - if (unlikely(type == TRANS_ATTACH)) + if (unlikely(type == TRANS_ATTACH || + type == TRANS_JOIN_NOSTART)) ret = -ENOENT; } } while (ret == -EBUSY); @@ -583,7 +640,7 @@ again: INIT_LIST_HEAD(&h->new_bgs); smp_mb(); - if (cur_trans->state >= TRANS_STATE_BLOCKED && + if (cur_trans->state >= TRANS_STATE_COMMIT_START && may_wait_transaction(fs_info, type)) { current->journal_info = h; btrfs_commit_transaction(h); @@ -650,7 +707,7 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( if (IS_ERR(trans)) return trans; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); ret = btrfs_cond_migrate_bytes(fs_info, &fs_info->trans_block_rsv, num_bytes, min_factor); if (ret) { @@ -672,13 +729,23 @@ struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) true); } -struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) +struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_JOIN_NOLOCK, BTRFS_RESERVE_NO_FLUSH, true); } /* + * Similar to regular join but it never starts a transaction when none is + * running or after waiting for the current one to finish. + */ +struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_JOIN_NOSTART, + BTRFS_RESERVE_NO_FLUSH, true); +} + +/* * btrfs_attach_transaction() - catch the running transaction * * It is used when we want to commit the current the transaction, but @@ -801,7 +868,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; smp_mb(); - if (cur_trans->state >= TRANS_STATE_BLOCKED || + if (cur_trans->state >= TRANS_STATE_COMMIT_START || cur_trans->delayed_refs.flushing) return 1; @@ -834,7 +901,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; - int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; if (refcount_read(&trans->use_count) > 1) { @@ -850,13 +916,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_chunk_metadata(trans); - if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { - if (throttle) - return btrfs_commit_transaction(trans); - else - wake_up_process(info->transaction_kthread); - } - if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(info->sb); @@ -928,7 +987,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, * superblock that points to btree nodes/leafs for which * writeback hasn't finished yet (and without errors). * We cleanup any entries left in the io tree when committing - * the transaction (through clear_btree_io_tree()). + * the transaction (through extent_io_tree_release()). */ if (err == -ENOMEM) { err = 0; @@ -973,7 +1032,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, * left in the io tree. For a log commit, we don't remove them * after committing the log because the tree can be accessed * concurrently - we do it only at transaction commit time when - * it's safe to do it (through clear_btree_io_tree()). + * it's safe to do it (through extent_io_tree_release()). */ err = clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 0, 0, &cached_state); @@ -993,7 +1052,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, return werr; } -int btrfs_wait_extents(struct btrfs_fs_info *fs_info, +static int btrfs_wait_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages) { bool errors = false; @@ -1051,7 +1110,7 @@ static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans) blk_finish_plug(&plug); ret2 = btrfs_wait_extents(fs_info, dirty_pages); - clear_btree_io_tree(&trans->transaction->dirty_pages); + extent_io_tree_release(&trans->transaction->dirty_pages); if (ret) return ret; @@ -1130,17 +1189,17 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) if (ret) return ret; - ret = btrfs_run_dev_stats(trans, fs_info); + ret = btrfs_run_dev_stats(trans); if (ret) return ret; - ret = btrfs_run_dev_replace(trans, fs_info); + ret = btrfs_run_dev_replace(trans); if (ret) return ret; ret = btrfs_run_qgroups(trans); if (ret) return ret; - ret = btrfs_setup_space_cache(trans, fs_info); + ret = btrfs_setup_space_cache(trans); if (ret) return ret; @@ -1168,7 +1227,7 @@ again: } while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { - ret = btrfs_write_dirty_block_groups(trans, fs_info); + ret = btrfs_write_dirty_block_groups(trans); if (ret) return ret; ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); @@ -1878,7 +1937,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *block_group, *tmp; + struct btrfs_block_group *block_group, *tmp; list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { btrfs_delayed_refs_rsv_release(fs_info, 1); @@ -1952,6 +2011,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) struct btrfs_transaction *prev_trans = NULL; int ret; + ASSERT(refcount_read(&trans->use_count) == 1); + /* Stop the commit early if ->aborted is set */ if (unlikely(READ_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; @@ -2056,6 +2117,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) } } else { spin_unlock(&fs_info->trans_lock); + /* + * The previous transaction was aborted and was already removed + * from the list of transactions at fs_info->trans_list. So we + * abort to prevent writing a new superblock that reflects a + * corrupt state (pointing to trees with unwritten nodes/leafs). + */ + if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) { + ret = -EROFS; + goto cleanup_transaction; + } } extwriter_counter_dec(cur_trans, trans->type); @@ -2241,8 +2312,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) memcpy(fs_info->super_for_commit, fs_info->super_copy, sizeof(*fs_info->super_copy)); - btrfs_update_commit_device_size(fs_info); - btrfs_update_commit_device_bytes_used(cur_trans); + btrfs_commit_device_sizes(cur_trans); clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index f1ba78949d1b..49f7196368f5 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -13,7 +13,6 @@ enum btrfs_trans_state { TRANS_STATE_RUNNING, - TRANS_STATE_BLOCKED, TRANS_STATE_COMMIT_START, TRANS_STATE_COMMIT_DOING, TRANS_STATE_UNBLOCKED, @@ -51,7 +50,7 @@ struct btrfs_transaction { wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; struct list_head pending_snapshots; - struct list_head pending_chunks; + struct list_head dev_update_list; struct list_head switch_commits; struct list_head dirty_bgs; @@ -80,7 +79,6 @@ struct btrfs_transaction { */ struct mutex cache_write_mutex; spinlock_t dirty_bgs_lock; - unsigned int num_dirty_bgs; /* Protected by spin lock fs_info->unused_bgs_lock. */ struct list_head deleted_bgs; spinlock_t dropped_roots_lock; @@ -95,11 +93,13 @@ struct btrfs_transaction { #define __TRANS_JOIN (1U << 11) #define __TRANS_JOIN_NOLOCK (1U << 12) #define __TRANS_DUMMY (1U << 13) +#define __TRANS_JOIN_NOSTART (1U << 14) #define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) #define TRANS_ATTACH (__TRANS_ATTACH) #define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE) #define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK) +#define TRANS_JOIN_NOSTART (__TRANS_JOIN_NOSTART) #define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) @@ -120,7 +120,6 @@ struct btrfs_trans_handle { bool allocating_chunk; bool can_flush_pending_bgs; bool reloc_reserved; - bool sync; bool dirty; struct btrfs_root *root; struct btrfs_fs_info *fs_info; @@ -184,7 +183,8 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( unsigned int num_items, int min_factor); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); -struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction_barrier( struct btrfs_root *root); @@ -217,8 +217,6 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark); -int btrfs_wait_extents(struct btrfs_fs_info *fs_info, - struct extent_io_tree *dirty_pages); int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); @@ -226,5 +224,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); #endif diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index a62e1e837a89..97f3520b8d98 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -15,11 +15,15 @@ * carefully reviewed otherwise so it does not prevent mount of valid images. */ +#include <linux/types.h> +#include <linux/stddef.h> +#include <linux/error-injection.h> #include "ctree.h" #include "tree-checker.h" #include "disk-io.h" #include "compression.h" #include "volumes.h" +#include "misc.h" /* * Error message should follow the following format: @@ -41,12 +45,12 @@ * Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt. * Allows callers to customize the output. */ -__printf(4, 5) +__printf(3, 4) __cold -static void generic_err(const struct btrfs_fs_info *fs_info, - const struct extent_buffer *eb, int slot, +static void generic_err(const struct extent_buffer *eb, int slot, const char *fmt, ...) { + const struct btrfs_fs_info *fs_info = eb->fs_info; struct va_format vaf; va_list args; @@ -66,12 +70,12 @@ static void generic_err(const struct btrfs_fs_info *fs_info, * Customized reporter for extent data item, since its key objectid and * offset has its own meaning. */ -__printf(4, 5) +__printf(3, 4) __cold -static void file_extent_err(const struct btrfs_fs_info *fs_info, - const struct extent_buffer *eb, int slot, +static void file_extent_err(const struct extent_buffer *eb, int slot, const char *fmt, ...) { + const struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_key key; struct va_format vaf; va_list args; @@ -94,38 +98,145 @@ static void file_extent_err(const struct btrfs_fs_info *fs_info, * Return 0 if the btrfs_file_extent_##name is aligned to @alignment * Else return 1 */ -#define CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, name, alignment) \ +#define CHECK_FE_ALIGNED(leaf, slot, fi, name, alignment) \ ({ \ if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \ - file_extent_err((fs_info), (leaf), (slot), \ + file_extent_err((leaf), (slot), \ "invalid %s for file extent, have %llu, should be aligned to %u", \ (#name), btrfs_file_extent_##name((leaf), (fi)), \ (alignment)); \ (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \ }) -static int check_extent_data_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, - struct btrfs_key *key, int slot) +static u64 file_extent_end(struct extent_buffer *leaf, + struct btrfs_key *key, + struct btrfs_file_extent_item *extent) +{ + u64 end; + u64 len; + + if (btrfs_file_extent_type(leaf, extent) == BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_ram_bytes(leaf, extent); + end = ALIGN(key->offset + len, leaf->fs_info->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(leaf, extent); + end = key->offset + len; + } + return end; +} + +/* + * Customized report for dir_item, the only new important information is + * key->objectid, which represents inode number + */ +__printf(3, 4) +__cold +static void dir_item_err(const struct extent_buffer *eb, int slot, + const char *fmt, ...) { + const struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(fs_info, + "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, + key.objectid, &vaf); + va_end(args); +} + +/* + * This functions checks prev_key->objectid, to ensure current key and prev_key + * share the same objectid as inode number. + * + * This is to detect missing INODE_ITEM in subvolume trees. + * + * Return true if everything is OK or we don't need to check. + * Return false if anything is wrong. + */ +static bool check_prev_ino(struct extent_buffer *leaf, + struct btrfs_key *key, int slot, + struct btrfs_key *prev_key) +{ + /* No prev key, skip check */ + if (slot == 0) + return true; + + /* Only these key->types needs to be checked */ + ASSERT(key->type == BTRFS_XATTR_ITEM_KEY || + key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_DIR_INDEX_KEY || + key->type == BTRFS_DIR_ITEM_KEY || + key->type == BTRFS_EXTENT_DATA_KEY); + + /* + * Only subvolume trees along with their reloc trees need this check. + * Things like log tree doesn't follow this ino requirement. + */ + if (!is_fstree(btrfs_header_owner(leaf))) + return true; + + if (key->objectid == prev_key->objectid) + return true; + + /* Error found */ + dir_item_err(leaf, slot, + "invalid previous key objectid, have %llu expect %llu", + prev_key->objectid, key->objectid); + return false; +} +static int check_extent_data_item(struct extent_buffer *leaf, + struct btrfs_key *key, int slot, + struct btrfs_key *prev_key) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_file_extent_item *fi; u32 sectorsize = fs_info->sectorsize; u32 item_size = btrfs_item_size_nr(leaf, slot); + u64 extent_end; if (!IS_ALIGNED(key->offset, sectorsize)) { - file_extent_err(fs_info, leaf, slot, + file_extent_err(leaf, slot, "unaligned file_offset for file extent, have %llu should be aligned to %u", key->offset, sectorsize); return -EUCLEAN; } + /* + * Previous key must have the same key->objectid (ino). + * It can be XATTR_ITEM, INODE_ITEM or just another EXTENT_DATA. + * But if objectids mismatch, it means we have a missing + * INODE_ITEM. + */ + if (!check_prev_ino(leaf, key, slot, prev_key)) + return -EUCLEAN; + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) { - file_extent_err(fs_info, leaf, slot, + /* + * Make sure the item contains at least inline header, so the file + * extent type is not some garbage. + */ + if (item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START) { + file_extent_err(leaf, slot, + "invalid item size, have %u expect [%zu, %u)", + item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START, + SZ_4K); + return -EUCLEAN; + } + if (btrfs_file_extent_type(leaf, fi) >= BTRFS_NR_FILE_EXTENT_TYPES) { + file_extent_err(leaf, slot, "invalid type for file extent, have %u expect range [0, %u]", btrfs_file_extent_type(leaf, fi), - BTRFS_FILE_EXTENT_TYPES); + BTRFS_NR_FILE_EXTENT_TYPES - 1); return -EUCLEAN; } @@ -133,15 +244,15 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info, * Support for new compression/encryption must introduce incompat flag, * and must be caught in open_ctree(). */ - if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { - file_extent_err(fs_info, leaf, slot, + if (btrfs_file_extent_compression(leaf, fi) >= BTRFS_NR_COMPRESS_TYPES) { + file_extent_err(leaf, slot, "invalid compression for file extent, have %u expect range [0, %u]", btrfs_file_extent_compression(leaf, fi), - BTRFS_COMPRESS_TYPES); + BTRFS_NR_COMPRESS_TYPES - 1); return -EUCLEAN; } if (btrfs_file_extent_encryption(leaf, fi)) { - file_extent_err(fs_info, leaf, slot, + file_extent_err(leaf, slot, "invalid encryption for file extent, have %u expect 0", btrfs_file_extent_encryption(leaf, fi)); return -EUCLEAN; @@ -149,7 +260,7 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info, if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { /* Inline extent must have 0 as key offset */ if (key->offset) { - file_extent_err(fs_info, leaf, slot, + file_extent_err(leaf, slot, "invalid file_offset for inline file extent, have %llu expect 0", key->offset); return -EUCLEAN; @@ -163,7 +274,7 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info, /* Uncompressed inline extent size must match item size */ if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + btrfs_file_extent_ram_bytes(leaf, fi)) { - file_extent_err(fs_info, leaf, slot, + file_extent_err(leaf, slot, "invalid ram_bytes for uncompressed inline extent, have %u expect %llu", item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START + btrfs_file_extent_ram_bytes(leaf, fi)); @@ -174,84 +285,105 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info, /* Regular or preallocated extent has fixed item size */ if (item_size != sizeof(*fi)) { - file_extent_err(fs_info, leaf, slot, + file_extent_err(leaf, slot, "invalid item size for reg/prealloc file extent, have %u expect %zu", item_size, sizeof(*fi)); return -EUCLEAN; } - if (CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, ram_bytes, sectorsize) || - CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_bytenr, sectorsize) || - CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_num_bytes, sectorsize) || - CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, offset, sectorsize) || - CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, num_bytes, sectorsize)) + if (CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize)) + return -EUCLEAN; + + /* Catch extent end overflow */ + if (check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi), + key->offset, &extent_end)) { + file_extent_err(leaf, slot, + "extent end overflow, have file offset %llu extent num bytes %llu", + key->offset, + btrfs_file_extent_num_bytes(leaf, fi)); return -EUCLEAN; + } + + /* + * Check that no two consecutive file extent items, in the same leaf, + * present ranges that overlap each other. + */ + if (slot > 0 && + prev_key->objectid == key->objectid && + prev_key->type == BTRFS_EXTENT_DATA_KEY) { + struct btrfs_file_extent_item *prev_fi; + u64 prev_end; + + prev_fi = btrfs_item_ptr(leaf, slot - 1, + struct btrfs_file_extent_item); + prev_end = file_extent_end(leaf, prev_key, prev_fi); + if (prev_end > key->offset) { + file_extent_err(leaf, slot - 1, +"file extent end range (%llu) goes beyond start offset (%llu) of the next file extent", + prev_end, key->offset); + return -EUCLEAN; + } + } + return 0; } -static int check_csum_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, struct btrfs_key *key, - int slot) +static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, + int slot, struct btrfs_key *prev_key) { + struct btrfs_fs_info *fs_info = leaf->fs_info; u32 sectorsize = fs_info->sectorsize; u32 csumsize = btrfs_super_csum_size(fs_info->super_copy); if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { - generic_err(fs_info, leaf, slot, + generic_err(leaf, slot, "invalid key objectid for csum item, have %llu expect %llu", key->objectid, BTRFS_EXTENT_CSUM_OBJECTID); return -EUCLEAN; } if (!IS_ALIGNED(key->offset, sectorsize)) { - generic_err(fs_info, leaf, slot, + generic_err(leaf, slot, "unaligned key offset for csum item, have %llu should be aligned to %u", key->offset, sectorsize); return -EUCLEAN; } if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) { - generic_err(fs_info, leaf, slot, + generic_err(leaf, slot, "unaligned item size for csum item, have %u should be aligned to %u", btrfs_item_size_nr(leaf, slot), csumsize); return -EUCLEAN; } + if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) { + u64 prev_csum_end; + u32 prev_item_size; + + prev_item_size = btrfs_item_size_nr(leaf, slot - 1); + prev_csum_end = (prev_item_size / csumsize) * sectorsize; + prev_csum_end += prev_key->offset; + if (prev_csum_end > key->offset) { + generic_err(leaf, slot - 1, +"csum end range (%llu) goes beyond the start range (%llu) of the next csum item", + prev_csum_end, key->offset); + return -EUCLEAN; + } + } return 0; } -/* - * Customized reported for dir_item, only important new info is key->objectid, - * which represents inode number - */ -__printf(4, 5) -__cold -static void dir_item_err(const struct btrfs_fs_info *fs_info, - const struct extent_buffer *eb, int slot, - const char *fmt, ...) -{ - struct btrfs_key key; - struct va_format vaf; - va_list args; - - btrfs_item_key_to_cpu(eb, &key, slot); - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - btrfs_crit(fs_info, - "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", - btrfs_header_level(eb) == 0 ? "leaf" : "node", - btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, - key.objectid, &vaf); - va_end(args); -} - -static int check_dir_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, - struct btrfs_key *key, int slot) +static int check_dir_item(struct extent_buffer *leaf, + struct btrfs_key *key, struct btrfs_key *prev_key, + int slot) { + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_dir_item *di; u32 item_size = btrfs_item_size_nr(leaf, slot); u32 cur = 0; + if (!check_prev_ino(leaf, key, slot, prev_key)) + return -EUCLEAN; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); while (cur < item_size) { u32 name_len; @@ -263,7 +395,7 @@ static int check_dir_item(struct btrfs_fs_info *fs_info, /* header itself should not cross item boundary */ if (cur + sizeof(*di) > item_size) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "dir item header crosses item boundary, have %zu boundary %u", cur + sizeof(*di), item_size); return -EUCLEAN; @@ -272,7 +404,7 @@ static int check_dir_item(struct btrfs_fs_info *fs_info, /* dir type check */ dir_type = btrfs_dir_type(leaf, di); if (dir_type >= BTRFS_FT_MAX) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "invalid dir item type, have %u expect [0, %u)", dir_type, BTRFS_FT_MAX); return -EUCLEAN; @@ -280,14 +412,14 @@ static int check_dir_item(struct btrfs_fs_info *fs_info, if (key->type == BTRFS_XATTR_ITEM_KEY && dir_type != BTRFS_FT_XATTR) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "invalid dir item type for XATTR key, have %u expect %u", dir_type, BTRFS_FT_XATTR); return -EUCLEAN; } if (dir_type == BTRFS_FT_XATTR && key->type != BTRFS_XATTR_ITEM_KEY) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "xattr dir type found for non-XATTR key"); return -EUCLEAN; } @@ -300,13 +432,13 @@ static int check_dir_item(struct btrfs_fs_info *fs_info, name_len = btrfs_dir_name_len(leaf, di); data_len = btrfs_dir_data_len(leaf, di); if (name_len > max_name_len) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "dir item name len too long, have %u max %u", name_len, max_name_len); return -EUCLEAN; } if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info)) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "dir item name and data len too long, have %u max %u", name_len + data_len, BTRFS_MAX_XATTR_SIZE(fs_info)); @@ -314,7 +446,7 @@ static int check_dir_item(struct btrfs_fs_info *fs_info, } if (data_len && dir_type != BTRFS_FT_XATTR) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "dir item with invalid data len, have %u expect 0", data_len); return -EUCLEAN; @@ -324,7 +456,7 @@ static int check_dir_item(struct btrfs_fs_info *fs_info, /* header and name/data should not cross item boundary */ if (cur + total_size > item_size) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "dir item data crosses item boundary, have %u boundary %u", cur + total_size, item_size); return -EUCLEAN; @@ -342,7 +474,7 @@ static int check_dir_item(struct btrfs_fs_info *fs_info, (unsigned long)(di + 1), name_len); name_hash = btrfs_name_hash(namebuf, name_len); if (key->offset != name_hash) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "name hash mismatch with key, have 0x%016x expect 0x%016llx", name_hash, key->offset); return -EUCLEAN; @@ -354,12 +486,12 @@ static int check_dir_item(struct btrfs_fs_info *fs_info, return 0; } -__printf(4, 5) +__printf(3, 4) __cold -static void block_group_err(const struct btrfs_fs_info *fs_info, - const struct extent_buffer *eb, int slot, +static void block_group_err(const struct extent_buffer *eb, int slot, const char *fmt, ...) { + const struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_key key; struct va_format vaf; va_list args; @@ -378,8 +510,7 @@ static void block_group_err(const struct btrfs_fs_info *fs_info, va_end(args); } -static int check_block_group_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, +static int check_block_group_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_block_group_item bgi; @@ -392,13 +523,13 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info, * handle it. We care more about the size. */ if (key->offset == 0) { - block_group_err(fs_info, leaf, slot, + block_group_err(leaf, slot, "invalid block group size 0"); return -EUCLEAN; } if (item_size != sizeof(bgi)) { - block_group_err(fs_info, leaf, slot, + block_group_err(leaf, slot, "invalid item size, have %u expect %zu", item_size, sizeof(bgi)); return -EUCLEAN; @@ -406,25 +537,25 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info, read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), sizeof(bgi)); - if (btrfs_block_group_chunk_objectid(&bgi) != + if (btrfs_stack_block_group_chunk_objectid(&bgi) != BTRFS_FIRST_CHUNK_TREE_OBJECTID) { - block_group_err(fs_info, leaf, slot, + block_group_err(leaf, slot, "invalid block group chunk objectid, have %llu expect %llu", - btrfs_block_group_chunk_objectid(&bgi), + btrfs_stack_block_group_chunk_objectid(&bgi), BTRFS_FIRST_CHUNK_TREE_OBJECTID); return -EUCLEAN; } - if (btrfs_block_group_used(&bgi) > key->offset) { - block_group_err(fs_info, leaf, slot, + if (btrfs_stack_block_group_used(&bgi) > key->offset) { + block_group_err(leaf, slot, "invalid block group used, have %llu expect [0, %llu)", - btrfs_block_group_used(&bgi), key->offset); + btrfs_stack_block_group_used(&bgi), key->offset); return -EUCLEAN; } - flags = btrfs_block_group_flags(&bgi); + flags = btrfs_stack_block_group_flags(&bgi); if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) { - block_group_err(fs_info, leaf, slot, + block_group_err(leaf, slot, "invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set", flags & BTRFS_BLOCK_GROUP_PROFILE_MASK, hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)); @@ -437,7 +568,7 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info, type != BTRFS_BLOCK_GROUP_SYSTEM && type != (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA)) { - block_group_err(fs_info, leaf, slot, + block_group_err(leaf, slot, "invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", type, hweight64(type), BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA, @@ -448,37 +579,841 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info, return 0; } +__printf(4, 5) +__cold +static void chunk_err(const struct extent_buffer *leaf, + const struct btrfs_chunk *chunk, u64 logical, + const char *fmt, ...) +{ + const struct btrfs_fs_info *fs_info = leaf->fs_info; + bool is_sb; + struct va_format vaf; + va_list args; + int i; + int slot = -1; + + /* Only superblock eb is able to have such small offset */ + is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET); + + if (!is_sb) { + /* + * Get the slot number by iterating through all slots, this + * would provide better readability. + */ + for (i = 0; i < btrfs_header_nritems(leaf); i++) { + if (btrfs_item_ptr_offset(leaf, i) == + (unsigned long)chunk) { + slot = i; + break; + } + } + } + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + if (is_sb) + btrfs_crit(fs_info, + "corrupt superblock syschunk array: chunk_start=%llu, %pV", + logical, &vaf); + else + btrfs_crit(fs_info, + "corrupt leaf: root=%llu block=%llu slot=%d chunk_start=%llu, %pV", + BTRFS_CHUNK_TREE_OBJECTID, leaf->start, slot, + logical, &vaf); + va_end(args); +} + +/* + * The common chunk check which could also work on super block sys chunk array. + * + * Return -EUCLEAN if anything is corrupted. + * Return 0 if everything is OK. + */ +int btrfs_check_chunk_valid(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 logical) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + u64 length; + u64 stripe_len; + u16 num_stripes; + u16 sub_stripes; + u64 type; + u64 features; + bool mixed = false; + + length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); + + if (!num_stripes) { + chunk_err(leaf, chunk, logical, + "invalid chunk num_stripes, have %u", num_stripes); + return -EUCLEAN; + } + if (!IS_ALIGNED(logical, fs_info->sectorsize)) { + chunk_err(leaf, chunk, logical, + "invalid chunk logical, have %llu should aligned to %u", + logical, fs_info->sectorsize); + return -EUCLEAN; + } + if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { + chunk_err(leaf, chunk, logical, + "invalid chunk sectorsize, have %u expect %u", + btrfs_chunk_sector_size(leaf, chunk), + fs_info->sectorsize); + return -EUCLEAN; + } + if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { + chunk_err(leaf, chunk, logical, + "invalid chunk length, have %llu", length); + return -EUCLEAN; + } + if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { + chunk_err(leaf, chunk, logical, + "invalid chunk stripe length: %llu", + stripe_len); + return -EUCLEAN; + } + if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & + type) { + chunk_err(leaf, chunk, logical, + "unrecognized chunk type: 0x%llx", + ~(BTRFS_BLOCK_GROUP_TYPE_MASK | + BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)); + return -EUCLEAN; + } + + if (!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && + (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) { + chunk_err(leaf, chunk, logical, + "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set", + type & BTRFS_BLOCK_GROUP_PROFILE_MASK); + return -EUCLEAN; + } + if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { + chunk_err(leaf, chunk, logical, + "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx", + type, BTRFS_BLOCK_GROUP_TYPE_MASK); + return -EUCLEAN; + } + + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && + (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { + chunk_err(leaf, chunk, logical, + "system chunk with data or metadata type: 0x%llx", + type); + return -EUCLEAN; + } + + features = btrfs_super_incompat_flags(fs_info->super_copy); + if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = true; + + if (!mixed) { + if ((type & BTRFS_BLOCK_GROUP_METADATA) && + (type & BTRFS_BLOCK_GROUP_DATA)) { + chunk_err(leaf, chunk, logical, + "mixed chunk type in non-mixed mode: 0x%llx", type); + return -EUCLEAN; + } + } + + if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || + (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || + (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || + (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || + (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || + ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) { + chunk_err(leaf, chunk, logical, + "invalid num_stripes:sub_stripes %u:%u for profile %llu", + num_stripes, sub_stripes, + type & BTRFS_BLOCK_GROUP_PROFILE_MASK); + return -EUCLEAN; + } + + return 0; +} + +__printf(3, 4) +__cold +static void dev_item_err(const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(eb->fs_info, + "corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, + key.objectid, &vaf); + va_end(args); +} + +static int check_dev_item(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_dev_item *ditem; + + if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) { + dev_item_err(leaf, slot, + "invalid objectid: has=%llu expect=%llu", + key->objectid, BTRFS_DEV_ITEMS_OBJECTID); + return -EUCLEAN; + } + ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); + if (btrfs_device_id(leaf, ditem) != key->offset) { + dev_item_err(leaf, slot, + "devid mismatch: key has=%llu item has=%llu", + key->offset, btrfs_device_id(leaf, ditem)); + return -EUCLEAN; + } + + /* + * For device total_bytes, we don't have reliable way to check it, as + * it can be 0 for device removal. Device size check can only be done + * by dev extents check. + */ + if (btrfs_device_bytes_used(leaf, ditem) > + btrfs_device_total_bytes(leaf, ditem)) { + dev_item_err(leaf, slot, + "invalid bytes used: have %llu expect [0, %llu]", + btrfs_device_bytes_used(leaf, ditem), + btrfs_device_total_bytes(leaf, ditem)); + return -EUCLEAN; + } + /* + * Remaining members like io_align/type/gen/dev_group aren't really + * utilized. Skip them to make later usage of them easier. + */ + return 0; +} + +/* Inode item error output has the same format as dir_item_err() */ +#define inode_item_err(fs_info, eb, slot, fmt, ...) \ + dir_item_err(eb, slot, fmt, __VA_ARGS__) + +static int check_inode_item(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_inode_item *iitem; + u64 super_gen = btrfs_super_generation(fs_info->super_copy); + u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); + u32 mode; + + if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID || + key->objectid > BTRFS_LAST_FREE_OBJECTID) && + key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID && + key->objectid != BTRFS_FREE_INO_OBJECTID) { + generic_err(leaf, slot, + "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu", + key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, + BTRFS_FIRST_FREE_OBJECTID, + BTRFS_LAST_FREE_OBJECTID, + BTRFS_FREE_INO_OBJECTID); + return -EUCLEAN; + } + if (key->offset != 0) { + inode_item_err(fs_info, leaf, slot, + "invalid key offset: has %llu expect 0", + key->offset); + return -EUCLEAN; + } + iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item); + + /* Here we use super block generation + 1 to handle log tree */ + if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) { + inode_item_err(fs_info, leaf, slot, + "invalid inode generation: has %llu expect (0, %llu]", + btrfs_inode_generation(leaf, iitem), + super_gen + 1); + return -EUCLEAN; + } + /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */ + if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) { + inode_item_err(fs_info, leaf, slot, + "invalid inode generation: has %llu expect [0, %llu]", + btrfs_inode_transid(leaf, iitem), super_gen + 1); + return -EUCLEAN; + } + + /* + * For size and nbytes it's better not to be too strict, as for dir + * item its size/nbytes can easily get wrong, but doesn't affect + * anything in the fs. So here we skip the check. + */ + mode = btrfs_inode_mode(leaf, iitem); + if (mode & ~valid_mask) { + inode_item_err(fs_info, leaf, slot, + "unknown mode bit detected: 0x%x", + mode & ~valid_mask); + return -EUCLEAN; + } + + /* + * S_IFMT is not bit mapped so we can't completely rely on + * is_power_of_2/has_single_bit_set, but it can save us from checking + * FIFO/CHR/DIR/REG. Only needs to check BLK, LNK and SOCKS + */ + if (!has_single_bit_set(mode & S_IFMT)) { + if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) { + inode_item_err(fs_info, leaf, slot, + "invalid mode: has 0%o expect valid S_IF* bit(s)", + mode & S_IFMT); + return -EUCLEAN; + } + } + if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) { + inode_item_err(fs_info, leaf, slot, + "invalid nlink: has %u expect no more than 1 for dir", + btrfs_inode_nlink(leaf, iitem)); + return -EUCLEAN; + } + if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) { + inode_item_err(fs_info, leaf, slot, + "unknown flags detected: 0x%llx", + btrfs_inode_flags(leaf, iitem) & + ~BTRFS_INODE_FLAG_MASK); + return -EUCLEAN; + } + return 0; +} + +static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, + int slot) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_root_item ri; + const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY | + BTRFS_ROOT_SUBVOL_DEAD; + + /* No such tree id */ + if (key->objectid == 0) { + generic_err(leaf, slot, "invalid root id 0"); + return -EUCLEAN; + } + + /* + * Some older kernel may create ROOT_ITEM with non-zero offset, so here + * we only check offset for reloc tree whose key->offset must be a + * valid tree. + */ + if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) { + generic_err(leaf, slot, "invalid root id 0 for reloc tree"); + return -EUCLEAN; + } + + if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) { + generic_err(leaf, slot, + "invalid root item size, have %u expect %zu", + btrfs_item_size_nr(leaf, slot), sizeof(ri)); + } + + read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), + sizeof(ri)); + + /* Generation related */ + if (btrfs_root_generation(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1) { + generic_err(leaf, slot, + "invalid root generation, have %llu expect (0, %llu]", + btrfs_root_generation(&ri), + btrfs_super_generation(fs_info->super_copy) + 1); + return -EUCLEAN; + } + if (btrfs_root_generation_v2(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1) { + generic_err(leaf, slot, + "invalid root v2 generation, have %llu expect (0, %llu]", + btrfs_root_generation_v2(&ri), + btrfs_super_generation(fs_info->super_copy) + 1); + return -EUCLEAN; + } + if (btrfs_root_last_snapshot(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1) { + generic_err(leaf, slot, + "invalid root last_snapshot, have %llu expect (0, %llu]", + btrfs_root_last_snapshot(&ri), + btrfs_super_generation(fs_info->super_copy) + 1); + return -EUCLEAN; + } + + /* Alignment and level check */ + if (!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize)) { + generic_err(leaf, slot, + "invalid root bytenr, have %llu expect to be aligned to %u", + btrfs_root_bytenr(&ri), fs_info->sectorsize); + return -EUCLEAN; + } + if (btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL) { + generic_err(leaf, slot, + "invalid root level, have %u expect [0, %u]", + btrfs_root_level(&ri), BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } + if (ri.drop_level >= BTRFS_MAX_LEVEL) { + generic_err(leaf, slot, + "invalid root level, have %u expect [0, %u]", + ri.drop_level, BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } + + /* Flags check */ + if (btrfs_root_flags(&ri) & ~valid_root_flags) { + generic_err(leaf, slot, + "invalid root flags, have 0x%llx expect mask 0x%llx", + btrfs_root_flags(&ri), valid_root_flags); + return -EUCLEAN; + } + return 0; +} + +__printf(3,4) +__cold +static void extent_err(const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + u64 bytenr; + u64 len; + + btrfs_item_key_to_cpu(eb, &key, slot); + bytenr = key.objectid; + if (key.type == BTRFS_METADATA_ITEM_KEY || + key.type == BTRFS_TREE_BLOCK_REF_KEY || + key.type == BTRFS_SHARED_BLOCK_REF_KEY) + len = eb->fs_info->nodesize; + else + len = key.offset; + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(eb->fs_info, + "corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + eb->start, slot, bytenr, len, &vaf); + va_end(args); +} + +static int check_extent_item(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_extent_item *ei; + bool is_tree_block = false; + unsigned long ptr; /* Current pointer inside inline refs */ + unsigned long end; /* Extent item end */ + const u32 item_size = btrfs_item_size_nr(leaf, slot); + u64 flags; + u64 generation; + u64 total_refs; /* Total refs in btrfs_extent_item */ + u64 inline_refs = 0; /* found total inline refs */ + + if (key->type == BTRFS_METADATA_ITEM_KEY && + !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { + generic_err(leaf, slot, +"invalid key type, METADATA_ITEM type invalid when SKINNY_METADATA feature disabled"); + return -EUCLEAN; + } + /* key->objectid is the bytenr for both key types */ + if (!IS_ALIGNED(key->objectid, fs_info->sectorsize)) { + generic_err(leaf, slot, + "invalid key objectid, have %llu expect to be aligned to %u", + key->objectid, fs_info->sectorsize); + return -EUCLEAN; + } + + /* key->offset is tree level for METADATA_ITEM_KEY */ + if (key->type == BTRFS_METADATA_ITEM_KEY && + key->offset >= BTRFS_MAX_LEVEL) { + extent_err(leaf, slot, + "invalid tree level, have %llu expect [0, %u]", + key->offset, BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } + + /* + * EXTENT/METADATA_ITEM consists of: + * 1) One btrfs_extent_item + * Records the total refs, type and generation of the extent. + * + * 2) One btrfs_tree_block_info (for EXTENT_ITEM and tree backref only) + * Records the first key and level of the tree block. + * + * 2) Zero or more btrfs_extent_inline_ref(s) + * Each inline ref has one btrfs_extent_inline_ref shows: + * 2.1) The ref type, one of the 4 + * TREE_BLOCK_REF Tree block only + * SHARED_BLOCK_REF Tree block only + * EXTENT_DATA_REF Data only + * SHARED_DATA_REF Data only + * 2.2) Ref type specific data + * Either using btrfs_extent_inline_ref::offset, or specific + * data structure. + */ + if (item_size < sizeof(*ei)) { + extent_err(leaf, slot, + "invalid item size, have %u expect [%zu, %u)", + item_size, sizeof(*ei), + BTRFS_LEAF_DATA_SIZE(fs_info)); + return -EUCLEAN; + } + end = item_size + btrfs_item_ptr_offset(leaf, slot); + + /* Checks against extent_item */ + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + total_refs = btrfs_extent_refs(leaf, ei); + generation = btrfs_extent_generation(leaf, ei); + if (generation > btrfs_super_generation(fs_info->super_copy) + 1) { + extent_err(leaf, slot, + "invalid generation, have %llu expect (0, %llu]", + generation, + btrfs_super_generation(fs_info->super_copy) + 1); + return -EUCLEAN; + } + if (!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA | + BTRFS_EXTENT_FLAG_TREE_BLOCK))) { + extent_err(leaf, slot, + "invalid extent flag, have 0x%llx expect 1 bit set in 0x%llx", + flags, BTRFS_EXTENT_FLAG_DATA | + BTRFS_EXTENT_FLAG_TREE_BLOCK); + return -EUCLEAN; + } + is_tree_block = !!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK); + if (is_tree_block) { + if (key->type == BTRFS_EXTENT_ITEM_KEY && + key->offset != fs_info->nodesize) { + extent_err(leaf, slot, + "invalid extent length, have %llu expect %u", + key->offset, fs_info->nodesize); + return -EUCLEAN; + } + } else { + if (key->type != BTRFS_EXTENT_ITEM_KEY) { + extent_err(leaf, slot, + "invalid key type, have %u expect %u for data backref", + key->type, BTRFS_EXTENT_ITEM_KEY); + return -EUCLEAN; + } + if (!IS_ALIGNED(key->offset, fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid extent length, have %llu expect aligned to %u", + key->offset, fs_info->sectorsize); + return -EUCLEAN; + } + } + ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1); + + /* Check the special case of btrfs_tree_block_info */ + if (is_tree_block && key->type != BTRFS_METADATA_ITEM_KEY) { + struct btrfs_tree_block_info *info; + + info = (struct btrfs_tree_block_info *)ptr; + if (btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL) { + extent_err(leaf, slot, + "invalid tree block info level, have %u expect [0, %u]", + btrfs_tree_block_level(leaf, info), + BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } + ptr = (unsigned long)(struct btrfs_tree_block_info *)(info + 1); + } + + /* Check inline refs */ + while (ptr < end) { + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + u64 dref_offset; + u64 inline_offset; + u8 inline_type; + + if (ptr + sizeof(*iref) > end) { + extent_err(leaf, slot, +"inline ref item overflows extent item, ptr %lu iref size %zu end %lu", + ptr, sizeof(*iref), end); + return -EUCLEAN; + } + iref = (struct btrfs_extent_inline_ref *)ptr; + inline_type = btrfs_extent_inline_ref_type(leaf, iref); + inline_offset = btrfs_extent_inline_ref_offset(leaf, iref); + if (ptr + btrfs_extent_inline_ref_size(inline_type) > end) { + extent_err(leaf, slot, +"inline ref item overflows extent item, ptr %lu iref size %u end %lu", + ptr, inline_type, end); + return -EUCLEAN; + } + + switch (inline_type) { + /* inline_offset is subvolid of the owner, no need to check */ + case BTRFS_TREE_BLOCK_REF_KEY: + inline_refs++; + break; + /* Contains parent bytenr */ + case BTRFS_SHARED_BLOCK_REF_KEY: + if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid tree parent bytenr, have %llu expect aligned to %u", + inline_offset, fs_info->sectorsize); + return -EUCLEAN; + } + inline_refs++; + break; + /* + * Contains owner subvolid, owner key objectid, adjusted offset. + * The only obvious corruption can happen in that offset. + */ + case BTRFS_EXTENT_DATA_REF_KEY: + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + dref_offset = btrfs_extent_data_ref_offset(leaf, dref); + if (!IS_ALIGNED(dref_offset, fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid data ref offset, have %llu expect aligned to %u", + dref_offset, fs_info->sectorsize); + return -EUCLEAN; + } + inline_refs += btrfs_extent_data_ref_count(leaf, dref); + break; + /* Contains parent bytenr and ref count */ + case BTRFS_SHARED_DATA_REF_KEY: + sref = (struct btrfs_shared_data_ref *)(iref + 1); + if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid data parent bytenr, have %llu expect aligned to %u", + inline_offset, fs_info->sectorsize); + return -EUCLEAN; + } + inline_refs += btrfs_shared_data_ref_count(leaf, sref); + break; + default: + extent_err(leaf, slot, "unknown inline ref type: %u", + inline_type); + return -EUCLEAN; + } + ptr += btrfs_extent_inline_ref_size(inline_type); + } + /* No padding is allowed */ + if (ptr != end) { + extent_err(leaf, slot, + "invalid extent item size, padding bytes found"); + return -EUCLEAN; + } + + /* Finally, check the inline refs against total refs */ + if (inline_refs > total_refs) { + extent_err(leaf, slot, + "invalid extent refs, have %llu expect >= inline %llu", + total_refs, inline_refs); + return -EUCLEAN; + } + return 0; +} + +static int check_simple_keyed_refs(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + u32 expect_item_size = 0; + + if (key->type == BTRFS_SHARED_DATA_REF_KEY) + expect_item_size = sizeof(struct btrfs_shared_data_ref); + + if (btrfs_item_size_nr(leaf, slot) != expect_item_size) { + generic_err(leaf, slot, + "invalid item size, have %u expect %u for key type %u", + btrfs_item_size_nr(leaf, slot), + expect_item_size, key->type); + return -EUCLEAN; + } + if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) { + generic_err(leaf, slot, +"invalid key objectid for shared block ref, have %llu expect aligned to %u", + key->objectid, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + if (key->type != BTRFS_TREE_BLOCK_REF_KEY && + !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid tree parent bytenr, have %llu expect aligned to %u", + key->offset, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + return 0; +} + +static int check_extent_data_ref(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_extent_data_ref *dref; + unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); + const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot); + + if (btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0) { + generic_err(leaf, slot, + "invalid item size, have %u expect aligned to %zu for key type %u", + btrfs_item_size_nr(leaf, slot), + sizeof(*dref), key->type); + } + if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) { + generic_err(leaf, slot, +"invalid key objectid for shared block ref, have %llu expect aligned to %u", + key->objectid, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + for (; ptr < end; ptr += sizeof(*dref)) { + u64 root_objectid; + u64 owner; + u64 offset; + u64 hash; + + dref = (struct btrfs_extent_data_ref *)ptr; + root_objectid = btrfs_extent_data_ref_root(leaf, dref); + owner = btrfs_extent_data_ref_objectid(leaf, dref); + offset = btrfs_extent_data_ref_offset(leaf, dref); + hash = hash_extent_data_ref(root_objectid, owner, offset); + if (hash != key->offset) { + extent_err(leaf, slot, + "invalid extent data ref hash, item has 0x%016llx key has 0x%016llx", + hash, key->offset); + return -EUCLEAN; + } + if (!IS_ALIGNED(offset, leaf->fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid extent data backref offset, have %llu expect aligned to %u", + offset, leaf->fs_info->sectorsize); + } + } + return 0; +} + +#define inode_ref_err(fs_info, eb, slot, fmt, args...) \ + inode_item_err(fs_info, eb, slot, fmt, ##args) +static int check_inode_ref(struct extent_buffer *leaf, + struct btrfs_key *key, struct btrfs_key *prev_key, + int slot) +{ + struct btrfs_inode_ref *iref; + unsigned long ptr; + unsigned long end; + + if (!check_prev_ino(leaf, key, slot, prev_key)) + return -EUCLEAN; + /* namelen can't be 0, so item_size == sizeof() is also invalid */ + if (btrfs_item_size_nr(leaf, slot) <= sizeof(*iref)) { + inode_ref_err(fs_info, leaf, slot, + "invalid item size, have %u expect (%zu, %u)", + btrfs_item_size_nr(leaf, slot), + sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); + return -EUCLEAN; + } + + ptr = btrfs_item_ptr_offset(leaf, slot); + end = ptr + btrfs_item_size_nr(leaf, slot); + while (ptr < end) { + u16 namelen; + + if (ptr + sizeof(iref) > end) { + inode_ref_err(fs_info, leaf, slot, + "inode ref overflow, ptr %lu end %lu inode_ref_size %zu", + ptr, end, sizeof(iref)); + return -EUCLEAN; + } + + iref = (struct btrfs_inode_ref *)ptr; + namelen = btrfs_inode_ref_name_len(leaf, iref); + if (ptr + sizeof(*iref) + namelen > end) { + inode_ref_err(fs_info, leaf, slot, + "inode ref overflow, ptr %lu end %lu namelen %u", + ptr, end, namelen); + return -EUCLEAN; + } + + /* + * NOTE: In theory we should record all found index numbers + * to find any duplicated indexes, but that will be too time + * consuming for inodes with too many hard links. + */ + ptr += sizeof(*iref) + namelen; + } + return 0; +} + /* * Common point to switch the item-specific validation. */ -static int check_leaf_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, - struct btrfs_key *key, int slot) +static int check_leaf_item(struct extent_buffer *leaf, + struct btrfs_key *key, int slot, + struct btrfs_key *prev_key) { int ret = 0; + struct btrfs_chunk *chunk; switch (key->type) { case BTRFS_EXTENT_DATA_KEY: - ret = check_extent_data_item(fs_info, leaf, key, slot); + ret = check_extent_data_item(leaf, key, slot, prev_key); break; case BTRFS_EXTENT_CSUM_KEY: - ret = check_csum_item(fs_info, leaf, key, slot); + ret = check_csum_item(leaf, key, slot, prev_key); break; case BTRFS_DIR_ITEM_KEY: case BTRFS_DIR_INDEX_KEY: case BTRFS_XATTR_ITEM_KEY: - ret = check_dir_item(fs_info, leaf, key, slot); + ret = check_dir_item(leaf, key, prev_key, slot); + break; + case BTRFS_INODE_REF_KEY: + ret = check_inode_ref(leaf, key, prev_key, slot); break; case BTRFS_BLOCK_GROUP_ITEM_KEY: - ret = check_block_group_item(fs_info, leaf, key, slot); + ret = check_block_group_item(leaf, key, slot); + break; + case BTRFS_CHUNK_ITEM_KEY: + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + ret = btrfs_check_chunk_valid(leaf, chunk, key->offset); + break; + case BTRFS_DEV_ITEM_KEY: + ret = check_dev_item(leaf, key, slot); + break; + case BTRFS_INODE_ITEM_KEY: + ret = check_inode_item(leaf, key, slot); + break; + case BTRFS_ROOT_ITEM_KEY: + ret = check_root_item(leaf, key, slot); + break; + case BTRFS_EXTENT_ITEM_KEY: + case BTRFS_METADATA_ITEM_KEY: + ret = check_extent_item(leaf, key, slot); + break; + case BTRFS_TREE_BLOCK_REF_KEY: + case BTRFS_SHARED_DATA_REF_KEY: + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = check_simple_keyed_refs(leaf, key, slot); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + ret = check_extent_data_ref(leaf, key, slot); break; } return ret; } -static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, - bool check_item_data) +static int check_leaf(struct extent_buffer *leaf, bool check_item_data) { + struct btrfs_fs_info *fs_info = leaf->fs_info; /* No valid key type is 0, so all key should be larger than this key */ struct btrfs_key prev_key = {0, 0, 0}; struct btrfs_key key; @@ -486,7 +1421,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, int slot; if (btrfs_header_level(leaf) != 0) { - generic_err(fs_info, leaf, 0, + generic_err(leaf, 0, "invalid level for leaf, have %d expect 0", btrfs_header_level(leaf)); return -EUCLEAN; @@ -502,7 +1437,6 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, */ if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { u64 owner = btrfs_header_owner(leaf); - struct btrfs_root *check_root; /* These trees must never be empty */ if (owner == BTRFS_ROOT_TREE_OBJECTID || @@ -511,33 +1445,16 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, owner == BTRFS_DEV_TREE_OBJECTID || owner == BTRFS_FS_TREE_OBJECTID || owner == BTRFS_DATA_RELOC_TREE_OBJECTID) { - generic_err(fs_info, leaf, 0, + generic_err(leaf, 0, "invalid root, root %llu must never be empty", owner); return -EUCLEAN; } - key.objectid = owner; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - check_root = btrfs_get_fs_root(fs_info, &key, false); - /* - * The only reason we also check NULL here is that during - * open_ctree() some roots has not yet been set up. - */ - if (!IS_ERR_OR_NULL(check_root)) { - struct extent_buffer *eb; - - eb = btrfs_root_node(check_root); - /* if leaf is the root, then it's fine */ - if (leaf != eb) { - generic_err(fs_info, leaf, 0, - "invalid nritems, have %u should not be 0 for non-root leaf", - nritems); - free_extent_buffer(eb); - return -EUCLEAN; - } - free_extent_buffer(eb); + /* Unknown tree */ + if (owner == 0) { + generic_err(leaf, 0, + "invalid owner, root 0 is not defined"); + return -EUCLEAN; } return 0; } @@ -564,7 +1481,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, /* Make sure the keys are in the right order */ if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) { - generic_err(fs_info, leaf, slot, + generic_err(leaf, slot, "bad key order, prev (%llu %u %llu) current (%llu %u %llu)", prev_key.objectid, prev_key.type, prev_key.offset, key.objectid, key.type, @@ -583,7 +1500,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, item_end_expected = btrfs_item_offset_nr(leaf, slot - 1); if (btrfs_item_end_nr(leaf, slot) != item_end_expected) { - generic_err(fs_info, leaf, slot, + generic_err(leaf, slot, "unexpected item end, have %u expect %u", btrfs_item_end_nr(leaf, slot), item_end_expected); @@ -597,7 +1514,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, */ if (btrfs_item_end_nr(leaf, slot) > BTRFS_LEAF_DATA_SIZE(fs_info)) { - generic_err(fs_info, leaf, slot, + generic_err(leaf, slot, "slot end outside of leaf, have %u expect range [0, %u]", btrfs_item_end_nr(leaf, slot), BTRFS_LEAF_DATA_SIZE(fs_info)); @@ -607,7 +1524,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, /* Also check if the item pointer overlaps with btrfs item. */ if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) > btrfs_item_ptr_offset(leaf, slot)) { - generic_err(fs_info, leaf, slot, + generic_err(leaf, slot, "slot overlaps with its data, item end %lu data start %lu", btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item), @@ -620,7 +1537,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, * Check if the item size and content meet other * criteria */ - ret = check_leaf_item(fs_info, leaf, &key, slot); + ret = check_leaf_item(leaf, &key, slot, &prev_key); if (ret < 0) return ret; } @@ -633,20 +1550,20 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, return 0; } -int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf) +int btrfs_check_leaf_full(struct extent_buffer *leaf) { - return check_leaf(fs_info, leaf, true); + return check_leaf(leaf, true); } +ALLOW_ERROR_INJECTION(btrfs_check_leaf_full, ERRNO); -int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf) +int btrfs_check_leaf_relaxed(struct extent_buffer *leaf) { - return check_leaf(fs_info, leaf, false); + return check_leaf(leaf, false); } -int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node) +int btrfs_check_node(struct extent_buffer *node) { + struct btrfs_fs_info *fs_info = node->fs_info; unsigned long nr = btrfs_header_nritems(node); struct btrfs_key key, next_key; int slot; @@ -655,7 +1572,7 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node) int ret = 0; if (level <= 0 || level >= BTRFS_MAX_LEVEL) { - generic_err(fs_info, node, 0, + generic_err(node, 0, "invalid level for node, have %d expect [1, %d]", level, BTRFS_MAX_LEVEL - 1); return -EUCLEAN; @@ -675,13 +1592,13 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node) btrfs_node_key_to_cpu(node, &next_key, slot + 1); if (!bytenr) { - generic_err(fs_info, node, slot, + generic_err(node, slot, "invalid NULL node pointer"); ret = -EUCLEAN; goto out; } if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) { - generic_err(fs_info, node, slot, + generic_err(node, slot, "unaligned pointer, have %llu should be aligned to %u", bytenr, fs_info->sectorsize); ret = -EUCLEAN; @@ -689,7 +1606,7 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node) } if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { - generic_err(fs_info, node, slot, + generic_err(node, slot, "bad key order, current (%llu %u %llu) next (%llu %u %llu)", key.objectid, key.type, key.offset, next_key.objectid, next_key.type, @@ -701,3 +1618,4 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node) out: return ret; } +ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO); diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index ff043275b784..32fecc9dc1dd 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -14,15 +14,16 @@ * Will check not only the item pointers, but also every possible member * in item data. */ -int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf); +int btrfs_check_leaf_full(struct extent_buffer *leaf); /* * Less strict leaf checker. * Will only check item pointers, not reading item data. */ -int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf); -int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node); +int btrfs_check_leaf_relaxed(struct extent_buffer *leaf); +int btrfs_check_node(struct extent_buffer *node); + +int btrfs_check_chunk_valid(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 logical); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 561884f60d35..d3f115909ff0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -8,6 +8,7 @@ #include <linux/blkdev.h> #include <linux/list_sort.h> #include <linux/iversion.h> +#include "misc.h" #include "ctree.h" #include "tree-log.h" #include "disk-io.h" @@ -24,10 +25,12 @@ * LOG_INODE_EXISTS means to log just enough to recreate the inode * during log replay */ -#define LOG_INODE_ALL 0 -#define LOG_INODE_EXISTS 1 -#define LOG_OTHER_INODE 2 -#define LOG_OTHER_INODE_ALL 3 +enum { + LOG_INODE_ALL, + LOG_INODE_EXISTS, + LOG_OTHER_INODE, + LOG_OTHER_INODE_ALL, +}; /* * directory trouble cases @@ -81,10 +84,12 @@ * The last stage is to deal with directories and links and extents * and all the other fun semantics */ -#define LOG_WALK_PIN_ONLY 0 -#define LOG_WALK_REPLAY_INODES 1 -#define LOG_WALK_REPLAY_DIR_INDEX 2 -#define LOG_WALK_REPLAY_ALL 3 +enum { + LOG_WALK_PIN_ONLY, + LOG_WALK_REPLAY_INODES, + LOG_WALK_REPLAY_DIR_INDEX, + LOG_WALK_REPLAY_ALL, +}; static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, @@ -139,7 +144,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); if (root->log_root) { - if (btrfs_need_log_full_commit(fs_info, trans)) { + if (btrfs_need_log_full_commit(trans)) { ret = -EAGAIN; goto out; } @@ -188,10 +193,6 @@ static int join_running_log_trans(struct btrfs_root *root) { int ret = -ENOENT; - smp_mb(); - if (!root->log_root) - return -ENOENT; - mutex_lock(&root->log_mutex); if (root->log_root) { ret = 0; @@ -225,6 +226,17 @@ void btrfs_end_log_trans(struct btrfs_root *root) } } +static int btrfs_write_tree_block(struct extent_buffer *buf) +{ + return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, + buf->start + buf->len - 1); +} + +static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +{ + filemap_fdatawait_range(buf->pages[0]->mapping, + buf->start, buf->start + buf->len - 1); +} /* * the walk control struct is used to pass state down the chain when @@ -304,7 +316,7 @@ static int process_one_buffer(struct btrfs_root *log, if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { if (wc->pin && btrfs_header_level(eb) == 0) - ret = btrfs_exclude_logged_extents(fs_info, eb); + ret = btrfs_exclude_logged_extents(eb); if (wc->write) btrfs_write_tree_block(eb); if (wc->wait) @@ -333,7 +345,6 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; u32 item_size; u64 saved_i_size = 0; @@ -454,10 +465,9 @@ insert: found_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); if (found_size > item_size) - btrfs_truncate_item(fs_info, path, item_size, 1); + btrfs_truncate_item(path, item_size, 1); else if (found_size < item_size) - btrfs_extend_item(fs_info, path, - item_size - found_size); + btrfs_extend_item(path, item_size - found_size); } else if (ret) { return ret; } @@ -496,7 +506,7 @@ insert: ino_size != 0) { struct btrfs_map_token token; - btrfs_init_map_token(&token); + btrfs_init_map_token(&token, dst_eb); btrfs_set_token_inode_size(dst_eb, dst_item, ino_size, &token); } @@ -549,7 +559,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root, key.objectid = objectid; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + inode = btrfs_iget(root->fs_info->sb, &key, root); if (IS_ERR(inode)) inode = NULL; return inode; @@ -694,9 +704,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; if (ins.objectid > 0) { + struct btrfs_ref ref = { 0 }; u64 csum_start; u64 csum_end; LIST_HEAD(ordered_sums); + /* * is this extent already allocated in the extent * allocation tree? If so, just add a reference @@ -704,10 +716,13 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); if (ret == 0) { - ret = btrfs_inc_extent_ref(trans, root, - ins.objectid, ins.offset, - 0, root->root_key.objectid, + btrfs_init_generic_ref(&ref, + BTRFS_ADD_DELAYED_REF, + ins.objectid, ins.offset, 0); + btrfs_init_data_ref(&ref, + root->root_key.objectid, key->objectid, offset); + ret = btrfs_inc_extent_ref(trans, &ref); if (ret) goto out; } else { @@ -793,7 +808,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_del_csums(trans, fs_info, + ret = btrfs_del_csums(trans, + fs_info->csum_root, sums->bytenr, sums->len); if (!ret) @@ -930,54 +946,32 @@ static noinline int backref_in_log(struct btrfs_root *log, const char *name, int namelen) { struct btrfs_path *path; - struct btrfs_inode_ref *ref; - unsigned long ptr; - unsigned long ptr_end; - unsigned long name_ptr; - int found_name_len; - int item_size; int ret; - int match = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(NULL, log, key, path, 0, 0); - if (ret != 0) + if (ret < 0) { goto out; - - ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - - if (key->type == BTRFS_INODE_EXTREF_KEY) { - if (btrfs_find_name_in_ext_backref(path->nodes[0], - path->slots[0], - ref_objectid, - name, namelen, NULL)) - match = 1; - + } else if (ret == 1) { + ret = 0; goto out; } - item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - ptr_end = ptr + item_size; - while (ptr < ptr_end) { - ref = (struct btrfs_inode_ref *)ptr; - found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); - if (found_name_len == namelen) { - name_ptr = (unsigned long)(ref + 1); - ret = memcmp_extent_buffer(path->nodes[0], name, - name_ptr, namelen); - if (ret == 0) { - match = 1; - goto out; - } - } - ptr = (unsigned long)(ref + 1) + found_name_len; - } + if (key->type == BTRFS_INODE_EXTREF_KEY) + ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], + ref_objectid, + name, namelen); + else + ret = !!btrfs_find_name_in_backref(path->nodes[0], + path->slots[0], + name, namelen); out: btrfs_free_path(path); - return match; + return ret; } static inline int __add_inode_ref(struct btrfs_trans_handle *trans, @@ -1035,10 +1029,13 @@ again: (unsigned long)(victim_ref + 1), victim_name_len); - if (!backref_in_log(log_root, &search_key, - parent_objectid, - victim_name, - victim_name_len)) { + ret = backref_in_log(log_root, &search_key, + parent_objectid, victim_name, + victim_name_len); + if (ret < 0) { + kfree(victim_name); + return ret; + } else if (!ret) { inc_nlink(&inode->vfs_inode); btrfs_release_path(path); @@ -1100,10 +1097,12 @@ again: search_key.offset = btrfs_extref_hash(parent_objectid, victim_name, victim_name_len); - ret = 0; - if (!backref_in_log(log_root, &search_key, - parent_objectid, victim_name, - victim_name_len)) { + ret = backref_in_log(log_root, &search_key, + parent_objectid, victim_name, + victim_name_len); + if (ret < 0) { + return ret; + } else if (!ret) { ret = -ENOENT; victim_parent = read_one_inode(root, parent_objectid); @@ -1252,12 +1251,12 @@ again: goto out; if (key->type == BTRFS_INODE_EXTREF_KEY) - ret = btrfs_find_name_in_ext_backref(log_eb, log_slot, - parent_id, name, - namelen, NULL); + ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, + parent_id, name, + namelen); else - ret = btrfs_find_name_in_backref(log_eb, log_slot, name, - namelen, NULL); + ret = !!btrfs_find_name_in_backref(log_eb, log_slot, + name, namelen); if (!ret) { struct inode *dir; @@ -1319,12 +1318,11 @@ static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir, goto out; } if (key.type == BTRFS_INODE_EXTREF_KEY) - ret = btrfs_find_name_in_ext_backref(path->nodes[0], - path->slots[0], parent_id, - name, namelen, NULL); + ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], parent_id, name, namelen); else - ret = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, namelen, NULL); + ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], + name, namelen); out: btrfs_free_path(path); @@ -1871,30 +1869,6 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, } /* - * Return true if an inode reference exists in the log for the given name, - * inode and parent inode. - */ -static bool name_in_log_ref(struct btrfs_root *log_root, - const char *name, const int name_len, - const u64 dirid, const u64 ino) -{ - struct btrfs_key search_key; - - search_key.objectid = ino; - search_key.type = BTRFS_INODE_REF_KEY; - search_key.offset = dirid; - if (backref_in_log(log_root, &search_key, dirid, name, name_len)) - return true; - - search_key.type = BTRFS_INODE_EXTREF_KEY; - search_key.offset = btrfs_extref_hash(dirid, name, name_len); - if (backref_in_log(log_root, &search_key, dirid, name, name_len)) - return true; - - return false; -} - -/* * take a single entry in a log directory item and replay it into * the subvolume. * @@ -2010,8 +1984,31 @@ out: return ret; insert: - if (name_in_log_ref(root->log_root, name, name_len, - key->objectid, log_key.objectid)) { + /* + * Check if the inode reference exists in the log for the given name, + * inode and parent inode + */ + found_key.objectid = log_key.objectid; + found_key.type = BTRFS_INODE_REF_KEY; + found_key.offset = key->objectid; + ret = backref_in_log(root->log_root, &found_key, 0, name, name_len); + if (ret < 0) { + goto out; + } else if (ret) { + /* The dentry will be added later. */ + ret = 0; + update_size = false; + goto out; + } + + found_key.objectid = log_key.objectid; + found_key.type = BTRFS_INODE_EXTREF_KEY; + found_key.offset = key->objectid; + ret = backref_in_log(root->log_root, &found_key, key->objectid, name, + name_len); + if (ret < 0) { + goto out; + } else if (ret) { /* The dentry will be added later. */ ret = 0; update_size = false; @@ -2725,7 +2722,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking_write(next); - clean_tree_block(fs_info, next); + btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } else { @@ -2809,7 +2806,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking_write(next); - clean_tree_block(fs_info, next); + btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } else { @@ -2855,7 +2852,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, level = btrfs_header_level(log->node); orig_level = level; path->nodes[level] = log->node; - extent_buffer_get(log->node); + atomic_inc(&log->node->refs); path->slots[level] = 0; while (1) { @@ -2891,7 +2888,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking_write(next); - clean_tree_block(fs_info, next); + btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } else { @@ -2918,7 +2915,8 @@ out: * in the tree of log roots */ static int update_log_root(struct btrfs_trans_handle *trans, - struct btrfs_root *log) + struct btrfs_root *log, + struct btrfs_root_item *root_item) { struct btrfs_fs_info *fs_info = log->fs_info; int ret; @@ -2926,10 +2924,10 @@ static int update_log_root(struct btrfs_trans_handle *trans, if (log->log_transid == 1) { /* insert root item on the first sync */ ret = btrfs_insert_root(trans, fs_info->log_root_tree, - &log->root_key, &log->root_item); + &log->root_key, root_item); } else { ret = btrfs_update_root(trans, fs_info->log_root_tree, - &log->root_key, &log->root_item); + &log->root_key, root_item); } return ret; } @@ -3027,6 +3025,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = fs_info->log_root_tree; + struct btrfs_root_item new_root_item; int log_transid = 0; struct btrfs_log_ctx root_log_ctx; struct blk_plug plug; @@ -3066,7 +3065,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } /* bail out if we need to do a full commit */ - if (btrfs_need_log_full_commit(fs_info, trans)) { + if (btrfs_need_log_full_commit(trans)) { ret = -EAGAIN; mutex_unlock(&root->log_mutex); goto out; @@ -3085,12 +3084,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (ret) { blk_finish_plug(&plug); btrfs_abort_transaction(trans, ret); - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); mutex_unlock(&root->log_mutex); goto out; } + /* + * We _must_ update under the root->log_mutex in order to make sure we + * have a consistent view of the log root we are trying to commit at + * this moment. + * + * We _must_ copy this into a local copy, because we are not holding the + * log_root_tree->log_mutex yet. This is important because when we + * commit the log_root_tree we must have a consistent view of the + * log_root_tree when we update the super block to point at the + * log_root_tree bytenr. If we update the log_root_tree here we'll race + * with the commit and possibly point at the new block which we may not + * have written out. + */ btrfs_set_root_node(&log->root_item, log->node); + memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); root->log_transid++; log->log_transid = root->log_transid; @@ -3114,9 +3127,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); - ret = update_log_root(trans, log); - mutex_lock(&log_root_tree->log_mutex); + + /* + * Now we are safe to update the log_root_tree because we're under the + * log_mutex, and we're a current writer so we're holding the commit + * open until we drop the log_mutex. + */ + ret = update_log_root(trans, log, &new_root_item); + if (atomic_dec_and_test(&log_root_tree->log_writers)) { /* atomic_dec_and_test implies a barrier */ cond_wake_up_nomb(&log_root_tree->log_writer_wait); @@ -3127,7 +3146,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, list_del_init(&root_log_ctx.list); blk_finish_plug(&plug); - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); if (ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); @@ -3173,7 +3192,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * now that we've moved on to the tree of log tree roots, * check the full commit flag again */ - if (btrfs_need_log_full_commit(fs_info, trans)) { + if (btrfs_need_log_full_commit(trans)) { blk_finish_plug(&plug); btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); @@ -3186,7 +3205,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); if (ret) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; @@ -3196,7 +3215,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_wait_tree_log_extents(log_root_tree, EXTENT_NEW | EXTENT_DIRTY); if (ret) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } @@ -3218,7 +3237,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ ret = write_all_supers(fs_info, 1); if (ret) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); goto out_wake_log_root; } @@ -3305,6 +3324,30 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, } /* + * Check if an inode was logged in the current transaction. We can't always rely + * on an inode's logged_trans value, because it's an in-memory only field and + * therefore not persisted. This means that its value is lost if the inode gets + * evicted and loaded again from disk (in which case it has a value of 0, and + * certainly it is smaller then any possible transaction ID), when that happens + * the full_sync flag is set in the inode's runtime flags, so on that case we + * assume eviction happened and ignore the logged_trans value, assuming the + * worst case, that the inode was logged before in the current transaction. + */ +static bool inode_logged(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) +{ + if (inode->logged_trans == trans->transid) + return true; + + if (inode->last_trans == trans->transid && + test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && + !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) + return true; + + return false; +} + +/* * If both a file and directory are logged, and unlinks or renames are * mixed in, we have a few interesting corners: * @@ -3338,7 +3381,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, int bytes_del = 0; u64 dir_ino = btrfs_ino(dir); - if (dir->logged_trans < trans->transid) + if (!inode_logged(trans, dir)) return 0; ret = join_running_log_trans(root); @@ -3422,7 +3465,7 @@ fail: out_unlock: mutex_unlock(&dir->log_mutex); if (ret == -ENOSPC) { - btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_set_log_full_commit(trans); ret = 0; } else if (ret < 0) btrfs_abort_transaction(trans, ret); @@ -3438,12 +3481,11 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *inode, u64 dirid) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log; u64 index; int ret; - if (inode->logged_trans < trans->transid) + if (!inode_logged(trans, inode)) return 0; ret = join_running_log_trans(root); @@ -3456,7 +3498,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, dirid, &index); mutex_unlock(&inode->log_mutex); if (ret == -ENOSPC) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); ret = 0; } else if (ret < 0 && ret != -ENOENT) btrfs_abort_transaction(trans, ret); @@ -3801,7 +3843,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, { struct btrfs_map_token token; - btrfs_init_map_token(&token); + btrfs_init_map_token(&token, leaf); if (log_inode_only) { /* set the generation to zero so the recover code @@ -3868,6 +3910,28 @@ static int log_inode_item(struct btrfs_trans_handle *trans, return 0; } +static int log_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *log_root, + struct btrfs_ordered_sum *sums) +{ + int ret; + + /* + * Due to extent cloning, we might have logged a csum item that covers a + * subrange of a cloned extent, and later we can end up logging a csum + * item for a larger subrange of the same extent or the entire range. + * This would leave csum items in the log tree that cover the same range + * and break the searches for checksums in the log tree, resulting in + * some checksums missing in the fs/subvolume tree. So just delete (or + * trim and adjust) any existing csum items in the log for this range. + */ + ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); + if (ret) + return ret; + + return btrfs_csum_file_blocks(trans, log_root, sums); +} + static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *dst_path, @@ -4013,7 +4077,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_csum_file_blocks(trans, log, sums); + ret = log_csums(trans, log, sums); list_del(&sums->list); kfree(sums); } @@ -4169,6 +4233,7 @@ fill_holes: *last_extent, 0, 0, len, 0, len, 0, 0, 0); + *last_extent += len; } } } @@ -4232,7 +4297,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_csum_file_blocks(trans, log_root, sums); + ret = log_csums(trans, log_root, sums); list_del(&sums->list); kfree(sums); } @@ -4260,8 +4325,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - btrfs_init_map_token(&token); - ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, em->start + em->len, NULL, 0, 1, sizeof(*fi), &extent_inserted); @@ -4279,6 +4342,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, return ret; } leaf = path->nodes[0]; + btrfs_init_map_token(&token, leaf); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -4924,7 +4988,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(fs_info->sb, &key, root, NULL); + inode = btrfs_iget(fs_info->sb, &key, root); /* * If the other inode that had a conflicting dir entry was * deleted in the current transaction, we need to log its parent @@ -4934,8 +4998,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, ret = PTR_ERR(inode); if (ret == -ENOENT) { key.objectid = parent; - inode = btrfs_iget(fs_info->sb, &key, root, - NULL); + inode = btrfs_iget(fs_info->sb, &key, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); } else { @@ -4943,7 +5006,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, BTRFS_I(inode), LOG_OTHER_INODE_ALL, 0, LLONG_MAX, ctx); - iput(inode); + btrfs_add_delayed_iput(inode); } } continue; @@ -4958,7 +5021,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_OTHER_INODE, 0, LLONG_MAX, ctx); if (ret) { - iput(inode); + btrfs_add_delayed_iput(inode); continue; } @@ -4967,7 +5030,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { - iput(inode); + btrfs_add_delayed_iput(inode); continue; } @@ -5014,7 +5077,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, } path->slots[0]++; } - iput(inode); + btrfs_add_delayed_iput(inode); } return ret; @@ -5402,9 +5465,19 @@ log_extents: } } + /* + * Don't update last_log_commit if we logged that an inode exists after + * it was loaded to memory (full_sync bit set). + * This is to prevent data loss when we do a write to the inode, then + * the inode gets evicted after all delalloc was flushed, then we log + * it exists (due to a rename for example) and then fsync it. This last + * fsync would do nothing (not logging the extents previously written). + */ spin_lock(&inode->lock); inode->logged_trans = trans->transid; - inode->last_log_commit = inode->last_sub_trans; + if (inode_only != LOG_INODE_EXISTS || + !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) + inode->last_log_commit = inode->last_sub_trans; spin_unlock(&inode->lock); out_unlock: mutex_unlock(&inode->log_mutex); @@ -5442,7 +5515,7 @@ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, * Make sure any commits to the log are forced to be full * commits. */ - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); ret = true; } mutex_unlock(&inode->log_mutex); @@ -5464,7 +5537,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, { int ret = 0; struct dentry *old_parent = NULL; - struct btrfs_inode *orig_inode = inode; /* * for regular files, if its inode is already on disk, we don't @@ -5484,16 +5556,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, } while (1) { - /* - * If we are logging a directory then we start with our inode, - * not our parent's inode, so we need to skip setting the - * logged_trans so that further down in the log code we don't - * think this inode has already been logged. - */ - if (inode != orig_inode) - inode->logged_trans = trans->transid; - smp_mb(); - if (btrfs_must_commit_transaction(trans, inode)) { ret = 1; break; @@ -5641,14 +5703,14 @@ process_leaf: continue; btrfs_release_path(path); - di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL); + di_inode = btrfs_iget(fs_info->sb, &di_key, root); if (IS_ERR(di_inode)) { ret = PTR_ERR(di_inode); goto next_dir_inode; } if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { - iput(di_inode); + btrfs_add_delayed_iput(di_inode); break; } @@ -5660,7 +5722,7 @@ process_leaf: if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) ret = 1; - iput(di_inode); + btrfs_add_delayed_iput(di_inode); if (ret) goto next_dir_inode; if (ctx->log_new_dentries) { @@ -5767,8 +5829,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, cur_offset = item_size; } - dir_inode = btrfs_iget(fs_info->sb, &inode_key, - root, NULL); + dir_inode = btrfs_iget(fs_info->sb, &inode_key, root); /* * If the parent inode was deleted, return an error to * fallback to a transaction commit. This is to prevent @@ -5807,7 +5868,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (!ret && ctx && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, root, BTRFS_I(dir_inode), ctx); - iput(dir_inode); + btrfs_add_delayed_iput(dir_inode); if (ret) goto out; } @@ -5819,6 +5880,190 @@ out: return ret; } +static int log_new_ancestors(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_key found_key; + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + + while (true) { + struct btrfs_fs_info *fs_info = root->fs_info; + const u64 last_committed = fs_info->last_trans_committed; + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_key search_key; + struct inode *inode; + int ret = 0; + + btrfs_release_path(path); + + search_key.objectid = found_key.offset; + search_key.type = BTRFS_INODE_ITEM_KEY; + search_key.offset = 0; + inode = btrfs_iget(fs_info->sb, &search_key, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (BTRFS_I(inode)->generation > last_committed) + ret = btrfs_log_inode(trans, root, BTRFS_I(inode), + LOG_INODE_EXISTS, + 0, LLONG_MAX, ctx); + btrfs_add_delayed_iput(inode); + if (ret) + return ret; + + if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) + break; + + search_key.type = BTRFS_INODE_REF_KEY; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + return ret; + + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + return -ENOENT; + leaf = path->nodes[0]; + slot = path->slots[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid != search_key.objectid || + found_key.type != BTRFS_INODE_REF_KEY) + return -ENOENT; + } + return 0; +} + +static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct dentry *parent, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct dentry *old_parent = NULL; + struct super_block *sb = inode->vfs_inode.i_sb; + int ret = 0; + + while (true) { + if (!parent || d_really_is_negative(parent) || + sb != parent->d_sb) + break; + + inode = BTRFS_I(d_inode(parent)); + if (root != inode->root) + break; + + if (inode->generation > fs_info->last_trans_committed) { + ret = btrfs_log_inode(trans, root, inode, + LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); + if (ret) + break; + } + if (IS_ROOT(parent)) + break; + + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; + } + dput(old_parent); + + return ret; +} + +static int log_all_new_ancestors(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct dentry *parent, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *root = inode->root; + const u64 ino = btrfs_ino(inode); + struct btrfs_path *path; + struct btrfs_key search_key; + int ret; + + /* + * For a single hard link case, go through a fast path that does not + * need to iterate the fs/subvolume tree. + */ + if (inode->vfs_inode.i_nlink < 2) + return log_new_ancestors_fast(trans, inode, parent, ctx); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + search_key.objectid = ino; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = 0; +again: + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; + if (ret == 0) + path->slots[0]++; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_key found_key; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid != ino || + found_key.type > BTRFS_INODE_EXTREF_KEY) + break; + + /* + * Don't deal with extended references because they are rare + * cases and too complex to deal with (we would need to keep + * track of which subitem we are processing for each item in + * this loop, etc). So just return some error to fallback to + * a transaction commit. + */ + if (found_key.type == BTRFS_INODE_EXTREF_KEY) { + ret = -EMLINK; + goto out; + } + + /* + * Logging ancestors needs to do more searches on the fs/subvol + * tree, so it releases the path as needed to avoid deadlocks. + * Keep track of the last inode ref key and resume from that key + * after logging all new ancestors for the current hard link. + */ + memcpy(&search_key, &found_key, sizeof(search_key)); + + ret = log_new_ancestors(trans, root, path, ctx); + if (ret) + goto out; + btrfs_release_path(path); + goto again; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -5836,11 +6081,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct super_block *sb; - struct dentry *old_parent = NULL; int ret = 0; u64 last_committed = fs_info->last_trans_committed; bool log_dentries = false; - struct btrfs_inode *orig_inode = inode; sb = inode->vfs_inode.i_sb; @@ -5946,56 +6189,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, * and has a link count of 2. */ if (inode->last_unlink_trans > last_committed) { - ret = btrfs_log_all_parents(trans, orig_inode, ctx); + ret = btrfs_log_all_parents(trans, inode, ctx); if (ret) goto end_trans; } - /* - * If a new hard link was added to the inode in the current transaction - * and its link count is now greater than 1, we need to fallback to a - * transaction commit, otherwise we can end up not logging all its new - * parents for all the hard links. Here just from the dentry used to - * fsync, we can not visit the ancestor inodes for all the other hard - * links to figure out if any is new, so we fallback to a transaction - * commit (instead of adding a lot of complexity of scanning a btree, - * since this scenario is not a common use case). - */ - if (inode->vfs_inode.i_nlink > 1 && - inode->last_link_trans > last_committed) { - ret = -EMLINK; + ret = log_all_new_ancestors(trans, inode, parent, ctx); + if (ret) goto end_trans; - } - - while (1) { - if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) - break; - - inode = BTRFS_I(d_inode(parent)); - if (root != inode->root) - break; - - if (inode->generation > last_committed) { - ret = btrfs_log_inode(trans, root, inode, - LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); - if (ret) - goto end_trans; - } - if (IS_ROOT(parent)) - break; - parent = dget_parent(parent); - dput(old_parent); - old_parent = parent; - } if (log_dentries) - ret = log_new_dir_dentries(trans, root, orig_inode, ctx); + ret = log_new_dir_dentries(trans, root, inode, ctx); else ret = 0; end_trans: - dput(old_parent); if (ret < 0) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); ret = 1; } @@ -6044,7 +6253,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) struct btrfs_fs_info *fs_info = log_root_tree->fs_info; struct walk_control wc = { .process_func = process_one_buffer, - .stage = 0, + .stage = LOG_WALK_PIN_ONLY, }; path = btrfs_alloc_path(); @@ -6108,9 +6317,28 @@ again: wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); + + /* + * We didn't find the subvol, likely because it was + * deleted. This is ok, simply skip this log and go to + * the next one. + * + * We need to exclude the root because we can't have + * other log replays overwriting this log as we'll read + * it back in a few more times. This will keep our + * block from being modified, and we'll just bail for + * each subsequent pass. + */ + if (ret == -ENOENT) + ret = btrfs_pin_extent_for_log_replay(fs_info, + log->node->start, + log->node->len); free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); + + if (!ret) + goto next; btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root for tree log recovery."); goto error; @@ -6142,7 +6370,6 @@ again: &root->highest_objectid); } - key.offset = found_key.offset - 1; wc.replay_dest->log_root = NULL; free_extent_buffer(log->node); free_extent_buffer(log->commit_root); @@ -6150,9 +6377,10 @@ again: if (ret) goto error; - +next: if (found_key.offset == 0) break; + key.offset = found_key.offset - 1; } btrfs_release_path(path); @@ -6222,7 +6450,6 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, * if this directory was already logged any new * names for this file/dir will get recorded */ - smp_mb(); if (dir->logged_trans == trans->transid) return; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 0fab84a8f670..132e43d29034 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -30,16 +30,14 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, INIT_LIST_HEAD(&ctx->list); } -static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans) +static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans) { - WRITE_ONCE(fs_info->last_trans_log_full_commit, trans->transid); + WRITE_ONCE(trans->fs_info->last_trans_log_full_commit, trans->transid); } -static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans) +static inline int btrfs_need_log_full_commit(struct btrfs_trans_handle *trans) { - return READ_ONCE(fs_info->last_trans_log_full_commit) == + return READ_ONCE(trans->fs_info->last_trans_log_full_commit) == trans->transid; } diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 3b2ae342e649..76b84f2397b1 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -121,12 +121,12 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, * An item with that type already exists. * Extend the item and store the new subid at the end. */ - btrfs_extend_item(fs_info, path, sizeof(subid_le)); + btrfs_extend_item(path, sizeof(subid_le)); eb = path->nodes[0]; slot = path->slots[0]; offset = btrfs_item_ptr_offset(eb, slot); offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); - } else if (ret < 0) { + } else { btrfs_warn(fs_info, "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!", ret, (unsigned long long)key.objectid, @@ -219,7 +219,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, move_src = offset + sizeof(subid); move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot)); memmove_extent_buffer(eb, move_dst, move_src, move_len); - btrfs_truncate_item(fs_info, path, item_size - sizeof(subid), 1); + btrfs_truncate_item(path, item_size - sizeof(subid), 1); out: btrfs_free_path(path); @@ -324,6 +324,8 @@ again_search_slot: } if (ret < 0 && ret != -ENOENT) goto out; + key.offset++; + goto again_search_slot; } item_size -= sizeof(subid_le); offset += sizeof(subid_le); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index db934ceae9c1..9b78e720c697 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -14,6 +14,7 @@ #include <linux/semaphore.h> #include <linux/uuid.h> #include <linux/list_sort.h> +#include "misc.h" #include "ctree.h" #include "extent_map.h" #include "disk-io.h" @@ -24,9 +25,11 @@ #include "async-thread.h" #include "check-integrity.h" #include "rcu-string.h" -#include "math.h" #include "dev-replace.h" #include "sysfs.h" +#include "tree-checker.h" +#include "space-info.h" +#include "block-group.h" const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { @@ -55,6 +58,30 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .bg_flag = BTRFS_BLOCK_GROUP_RAID1, .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, }, + [BTRFS_RAID_RAID1C3] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 3, + .devs_min = 3, + .tolerated_failures = 2, + .devs_increment = 3, + .ncopies = 3, + .raid_name = "raid1c3", + .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, + .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, + }, + [BTRFS_RAID_RAID1C4] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 4, + .devs_min = 4, + .tolerated_failures = 3, + .devs_increment = 4, + .ncopies = 4, + .raid_name = "raid1c4", + .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, + .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, + }, [BTRFS_RAID_DUP] = { .sub_stripes = 1, .dev_stripes = 2, @@ -122,12 +149,14 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { }, }; -const char *get_raid_name(enum btrfs_raid_types type) +const char *btrfs_bg_type_to_raid_name(u64 flags) { - if (type >= BTRFS_NR_RAID_TYPES) + const int index = btrfs_bg_flags_to_raid_index(flags); + + if (index >= BTRFS_NR_RAID_TYPES) return NULL; - return btrfs_raid_array[type].raid_name; + return btrfs_raid_array[index].raid_name; } /* @@ -184,10 +213,8 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) out_overflow:; } -static int init_first_rw_device(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); -static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static int __btrfs_map_block(struct btrfs_fs_info *fs_info, @@ -237,7 +264,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * chunk_mutex * ----------- * protects chunks, adding or removing during allocation, trim or when a new - * device is added/removed + * device is added/removed. Additionally it also protects post_commit_list of + * individual devices, since they can be added to the transaction's + * post_commit_list only with chunk_mutex held. * * cleaner_mutex * ------------- @@ -292,7 +321,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); -struct list_head *btrfs_get_fs_uuids(void) +struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) { return &fs_uuids; } @@ -318,7 +347,6 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, mutex_init(&fs_devs->device_list_mutex); INIT_LIST_HEAD(&fs_devs->devices); - INIT_LIST_HEAD(&fs_devs->resized_devices); INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->fs_list); if (fsid) @@ -334,7 +362,9 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, void btrfs_free_device(struct btrfs_device *device) { + WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); + extent_io_tree_release(&device->alloc_state); bio_put(device->flush_bio); kfree(device); } @@ -352,19 +382,6 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) kfree(fs_devices); } -static void btrfs_kobject_uevent(struct block_device *bdev, - enum kobject_action action) -{ - int ret; - - ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); - if (ret) - pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", - action, - kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), - &disk_to_dev(bdev->bd_disk)->kobj); -} - void __exit btrfs_cleanup_fs_uuids(void) { struct btrfs_fs_devices *fs_devices; @@ -402,15 +419,14 @@ static struct btrfs_device *__alloc_device(void) INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); - INIT_LIST_HEAD(&dev->resized_list); - - spin_lock_init(&dev->io_lock); + INIT_LIST_HEAD(&dev->post_commit_list); atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL); return dev; } @@ -507,212 +523,6 @@ error: return ret; } -static void requeue_list(struct btrfs_pending_bios *pending_bios, - struct bio *head, struct bio *tail) -{ - - struct bio *old_head; - - old_head = pending_bios->head; - pending_bios->head = head; - if (pending_bios->tail) - tail->bi_next = old_head; - else - pending_bios->tail = tail; -} - -/* - * we try to collect pending bios for a device so we don't get a large - * number of procs sending bios down to the same device. This greatly - * improves the schedulers ability to collect and merge the bios. - * - * But, it also turns into a long list of bios to process and that is sure - * to eventually make the worker thread block. The solution here is to - * make some progress and then put this work struct back at the end of - * the list if the block device is congested. This way, multiple devices - * can make progress from a single worker thread. - */ -static noinline void run_scheduled_bios(struct btrfs_device *device) -{ - struct btrfs_fs_info *fs_info = device->fs_info; - struct bio *pending; - struct backing_dev_info *bdi; - struct btrfs_pending_bios *pending_bios; - struct bio *tail; - struct bio *cur; - int again = 0; - unsigned long num_run; - unsigned long batch_run = 0; - unsigned long last_waited = 0; - int force_reg = 0; - int sync_pending = 0; - struct blk_plug plug; - - /* - * this function runs all the bios we've collected for - * a particular device. We don't want to wander off to - * another device without first sending all of these down. - * So, setup a plug here and finish it off before we return - */ - blk_start_plug(&plug); - - bdi = device->bdev->bd_bdi; - -loop: - spin_lock(&device->io_lock); - -loop_lock: - num_run = 0; - - /* take all the bios off the list at once and process them - * later on (without the lock held). But, remember the - * tail and other pointers so the bios can be properly reinserted - * into the list if we hit congestion - */ - if (!force_reg && device->pending_sync_bios.head) { - pending_bios = &device->pending_sync_bios; - force_reg = 1; - } else { - pending_bios = &device->pending_bios; - force_reg = 0; - } - - pending = pending_bios->head; - tail = pending_bios->tail; - WARN_ON(pending && !tail); - - /* - * if pending was null this time around, no bios need processing - * at all and we can stop. Otherwise it'll loop back up again - * and do an additional check so no bios are missed. - * - * device->running_pending is used to synchronize with the - * schedule_bio code. - */ - if (device->pending_sync_bios.head == NULL && - device->pending_bios.head == NULL) { - again = 0; - device->running_pending = 0; - } else { - again = 1; - device->running_pending = 1; - } - - pending_bios->head = NULL; - pending_bios->tail = NULL; - - spin_unlock(&device->io_lock); - - while (pending) { - - rmb(); - /* we want to work on both lists, but do more bios on the - * sync list than the regular list - */ - if ((num_run > 32 && - pending_bios != &device->pending_sync_bios && - device->pending_sync_bios.head) || - (num_run > 64 && pending_bios == &device->pending_sync_bios && - device->pending_bios.head)) { - spin_lock(&device->io_lock); - requeue_list(pending_bios, pending, tail); - goto loop_lock; - } - - cur = pending; - pending = pending->bi_next; - cur->bi_next = NULL; - - BUG_ON(atomic_read(&cur->__bi_cnt) == 0); - - /* - * if we're doing the sync list, record that our - * plug has some sync requests on it - * - * If we're doing the regular list and there are - * sync requests sitting around, unplug before - * we add more - */ - if (pending_bios == &device->pending_sync_bios) { - sync_pending = 1; - } else if (sync_pending) { - blk_finish_plug(&plug); - blk_start_plug(&plug); - sync_pending = 0; - } - - btrfsic_submit_bio(cur); - num_run++; - batch_run++; - - cond_resched(); - - /* - * we made progress, there is more work to do and the bdi - * is now congested. Back off and let other work structs - * run instead - */ - if (pending && bdi_write_congested(bdi) && batch_run > 8 && - fs_info->fs_devices->open_devices > 1) { - struct io_context *ioc; - - ioc = current->io_context; - - /* - * the main goal here is that we don't want to - * block if we're going to be able to submit - * more requests without blocking. - * - * This code does two great things, it pokes into - * the elevator code from a filesystem _and_ - * it makes assumptions about how batching works. - */ - if (ioc && ioc->nr_batch_requests > 0 && - time_before(jiffies, ioc->last_waited + HZ/50UL) && - (last_waited == 0 || - ioc->last_waited == last_waited)) { - /* - * we want to go through our batch of - * requests and stop. So, we copy out - * the ioc->last_waited time and test - * against it before looping - */ - last_waited = ioc->last_waited; - cond_resched(); - continue; - } - spin_lock(&device->io_lock); - requeue_list(pending_bios, pending, tail); - device->running_pending = 1; - - spin_unlock(&device->io_lock); - btrfs_queue_work(fs_info->submit_workers, - &device->work); - goto done; - } - } - - cond_resched(); - if (again) - goto loop; - - spin_lock(&device->io_lock); - if (device->pending_bios.head || device->pending_sync_bios.head) - goto loop_lock; - spin_unlock(&device->io_lock); - -done: - blk_finish_plug(&plug); -} - -static void pending_bios_fn(struct btrfs_work *work) -{ - struct btrfs_device *device; - - device = container_of(work, struct btrfs_device, work); - run_scheduled_bios(device); -} - static bool device_path_matched(const char *path, struct btrfs_device *device) { int found; @@ -824,7 +634,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - fs_devices->seeding = 1; + fs_devices->seeding = true; } else { if (bdev_read_only(bdev)) clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); @@ -834,7 +644,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, q = bdev_get_queue(bdev); if (!blk_queue_nonrot(q)) - fs_devices->rotating = 1; + fs_devices->rotating = true; device->bdev = bdev; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); @@ -1011,11 +821,15 @@ static noinline struct btrfs_device *device_list_add(const char *path, *new_device_added = true; if (disk_super->label[0]) - pr_info("BTRFS: device label %s devid %llu transid %llu %s\n", - disk_super->label, devid, found_transid, path); + pr_info( + "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n", + disk_super->label, devid, found_transid, path, + current->comm, task_pid_nr(current)); else - pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n", - disk_super->fsid, devid, found_transid, path); + pr_info( + "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n", + disk_super->fsid, devid, found_transid, path, + current->comm, task_pid_nr(current)); } else if (!device->name || strcmp(device->name->str, path)) { /* @@ -1121,6 +935,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) struct btrfs_fs_devices *fs_devices; struct btrfs_device *device; struct btrfs_device *orig_dev; + int ret = 0; fs_devices = alloc_fs_devices(orig->fsid, NULL); if (IS_ERR(fs_devices)) @@ -1134,8 +949,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) device = btrfs_alloc_device(NULL, &orig_dev->devid, orig_dev->uuid); - if (IS_ERR(device)) + if (IS_ERR(device)) { + ret = PTR_ERR(device); goto error; + } /* * This is ok to do without rcu read locked because we hold the @@ -1146,6 +963,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) GFP_KERNEL); if (!name) { btrfs_free_device(device); + ret = -ENOMEM; goto error; } rcu_assign_pointer(device->name, name); @@ -1160,7 +978,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) error: mutex_unlock(&orig->device_list_mutex); free_fs_devices(fs_devices); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } /* @@ -1230,14 +1048,6 @@ again: mutex_unlock(&uuid_mutex); } -static void free_device_rcu(struct rcu_head *head) -{ - struct btrfs_device *device; - - device = container_of(head, struct btrfs_device, rcu); - btrfs_free_device(device); -} - static void btrfs_close_bdev(struct btrfs_device *device) { if (!device->bdev) @@ -1285,7 +1095,8 @@ static void btrfs_close_one_device(struct btrfs_device *device) list_replace_rcu(&device->dev_list, &new_device->dev_list); new_device->fs_devices = device->fs_devices; - call_rcu(&device->rcu, free_device_rcu); + synchronize_rcu(); + btrfs_free_device(device); } static int close_fs_devices(struct btrfs_fs_devices *fs_devices) @@ -1304,7 +1115,7 @@ static int close_fs_devices(struct btrfs_fs_devices *fs_devices) WARN_ON(fs_devices->open_devices); WARN_ON(fs_devices->rw_devices); fs_devices->opened = 0; - fs_devices->seeding = 0; + fs_devices->seeding = false; return 0; } @@ -1505,58 +1316,29 @@ error_bdev_put: return device; } -static int contains_pending_extent(struct btrfs_transaction *transaction, - struct btrfs_device *device, - u64 *start, u64 len) +/* + * Try to find a chunk that intersects [start, start + len] range and when one + * such is found, record the end of it in *start + */ +static bool contains_pending_extent(struct btrfs_device *device, u64 *start, + u64 len) { - struct btrfs_fs_info *fs_info = device->fs_info; - struct extent_map *em; - struct list_head *search_list = &fs_info->pinned_chunks; - int ret = 0; - u64 physical_start = *start; + u64 physical_start, physical_end; - if (transaction) - search_list = &transaction->pending_chunks; -again: - list_for_each_entry(em, search_list, list) { - struct map_lookup *map; - int i; + lockdep_assert_held(&device->fs_info->chunk_mutex); - map = em->map_lookup; - for (i = 0; i < map->num_stripes; i++) { - u64 end; + if (!find_first_extent_bit(&device->alloc_state, *start, + &physical_start, &physical_end, + CHUNK_ALLOCATED, NULL)) { - if (map->stripes[i].dev != device) - continue; - if (map->stripes[i].physical >= physical_start + len || - map->stripes[i].physical + em->orig_block_len <= - physical_start) - continue; - /* - * Make sure that while processing the pinned list we do - * not override our *start with a lower value, because - * we can have pinned chunks that fall within this - * device hole and that have lower physical addresses - * than the pending chunks we processed before. If we - * do not take this special care we can end up getting - * 2 pending chunks that start at the same physical - * device offsets because the end offset of a pinned - * chunk can be equal to the start offset of some - * pending chunk. - */ - end = map->stripes[i].physical + em->orig_block_len; - if (end > *start) { - *start = end; - ret = 1; - } + if (in_range(physical_start, *start, len) || + in_range(*start, physical_start, + physical_end - physical_start)) { + *start = physical_end + 1; + return true; } } - if (search_list != &fs_info->pinned_chunks) { - search_list = &fs_info->pinned_chunks; - goto again; - } - - return ret; + return false; } @@ -1580,10 +1362,16 @@ again: * @len is used to store the size of the free space that we find. * But if we don't find suitable free space, it is used to store the size of * the max free space. + * + * NOTE: This function will search *commit* root of device tree, and does extra + * check to ensure dev extents are not double allocated. + * This makes the function safe to allocate dev extents but may not report + * correct usable device space, as device extent freed in current transaction + * is not reported as avaiable. */ -int find_free_dev_extent_start(struct btrfs_transaction *transaction, - struct btrfs_device *device, u64 num_bytes, - u64 search_start, u64 *start, u64 *len) +static int find_free_dev_extent_start(struct btrfs_device *device, + u64 num_bytes, u64 search_start, u64 *start, + u64 *len) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; @@ -1667,15 +1455,12 @@ again: * Have to check before we set max_hole_start, otherwise * we could end up sending back this offset anyway. */ - if (contains_pending_extent(transaction, device, - &search_start, + if (contains_pending_extent(device, &search_start, hole_size)) { - if (key.offset >= search_start) { + if (key.offset >= search_start) hole_size = key.offset - search_start; - } else { - WARN_ON_ONCE(1); + else hole_size = 0; - } } if (hole_size > max_hole_size) { @@ -1716,8 +1501,7 @@ next: if (search_end > search_start) { hole_size = search_end - search_start; - if (contains_pending_extent(transaction, device, &search_start, - hole_size)) { + if (contains_pending_extent(device, &search_start, hole_size)) { btrfs_release_path(path); goto again; } @@ -1742,13 +1526,11 @@ out: return ret; } -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { /* FIXME use last free of some kind */ - return find_free_dev_extent_start(trans->transaction, device, - num_bytes, 0, start, len); + return find_free_dev_extent_start(device, num_bytes, 0, start, len); } static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, @@ -1859,7 +1641,7 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info) struct rb_node *n; u64 ret = 0; - em_tree = &fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree; read_lock(&em_tree->lock); n = rb_last(&em_tree->map.rb_root); if (n) { @@ -1891,7 +1673,12 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info, if (ret < 0) goto error; - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* Corruption */ + btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); + ret = -EUCLEAN; + goto error; + } ret = btrfs_previous_item(fs_info->chunk_root, path, BTRFS_DEV_ITEMS_OBJECTID, @@ -1982,10 +1769,9 @@ static void update_dev_time(const char *path_name) filp_close(filp, NULL); } -static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info, - struct btrfs_device *device) +static int btrfs_rm_dev_item(struct btrfs_device *device) { - struct btrfs_root *root = fs_info->chunk_root; + struct btrfs_root *root = device->fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; @@ -2082,7 +1868,7 @@ static struct btrfs_device * btrfs_find_next_active_device( * where this function called, there should be always be another device (or * this_dev) which is active. */ -void btrfs_assign_next_active_device(struct btrfs_device *device, +void __cold btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *this_dev) { struct btrfs_fs_info *fs_info = device->fs_info; @@ -2186,12 +1972,12 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, * counter although write_all_supers() is not locked out. This * could give a filesystem state which requires a degraded mount. */ - ret = btrfs_rm_dev_item(fs_info, device); + ret = btrfs_rm_dev_item(device); if (ret) goto error_undo; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - btrfs_scrub_cancel_dev(fs_info, device); + btrfs_scrub_cancel_dev(device); /* * the device list mutex makes sure that we don't change @@ -2242,7 +2028,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, btrfs_scratch_superblocks(device->bdev, device->name->str); btrfs_close_bdev(device); - call_rcu(&device->rcu, free_device_rcu); + synchronize_rcu(); + btrfs_free_device(device); if (cur_devices->open_devices == 0) { while (fs_devices) { @@ -2299,9 +2086,9 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) fs_devices->open_devices--; } -void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev) +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) { + struct btrfs_fs_info *fs_info = srcdev->fs_info; struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { @@ -2310,7 +2097,8 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, } btrfs_close_bdev(srcdev); - call_rcu(&srcdev->rcu, free_device_rcu); + synchronize_rcu(); + btrfs_free_device(srcdev); /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { @@ -2368,7 +2156,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); btrfs_close_bdev(tgtdev); - call_rcu(&tgtdev->rcu, free_device_rcu); + synchronize_rcu(); + btrfs_free_device(tgtdev); } static struct btrfs_device *btrfs_find_device_by_path( @@ -2481,11 +2270,11 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); mutex_unlock(&fs_info->chunk_mutex); - fs_devices->seeding = 0; + fs_devices->seeding = false; fs_devices->num_devices = 0; fs_devices->open_devices = 0; fs_devices->missing_devices = 0; - fs_devices->rotating = 0; + fs_devices->rotating = false; fs_devices->seed = seed_devices; generate_random_uuid(fs_devices->fsid); @@ -2503,9 +2292,9 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) /* * Store the expected generation for seed devices in device items. */ -static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -2680,7 +2469,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path atomic64_add(device->total_bytes, &fs_info->free_chunk_space); if (!blk_queue_nonrot(q)) - fs_devices->rotating = 1; + fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); btrfs_set_super_total_bytes(fs_info->super_copy, @@ -2705,7 +2494,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { mutex_lock(&fs_info->chunk_mutex); - ret = init_first_rw_device(trans, fs_info); + ret = init_first_rw_device(trans); mutex_unlock(&fs_info->chunk_mutex); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2720,22 +2509,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } if (seeding_dev) { - char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; - - ret = btrfs_finish_sprout(trans, fs_info); + ret = btrfs_finish_sprout(trans); if (ret) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } - /* Sprouting would change fsid of the mounted root, - * so rename the fsid on the sysfs - */ - snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", - fs_info->fs_devices->fsid); - if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) - btrfs_warn(fs_info, - "sysfs: failed to create fsid for sprout"); + btrfs_sysfs_update_sprout_fsid(fs_devices, + fs_info->fs_devices->fsid); } ret = btrfs_commit_transaction(trans); @@ -2852,7 +2633,6 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_super_block *super_copy = fs_info->super_copy; - struct btrfs_fs_devices *fs_devices; u64 old_total; u64 diff; @@ -2871,8 +2651,6 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, return -EINVAL; } - fs_devices = fs_info->fs_devices; - btrfs_set_super_total_bytes(super_copy, round_down(old_total + diff, fs_info->sectorsize)); device->fs_devices->total_rw_bytes += diff; @@ -2880,9 +2658,9 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, btrfs_device_set_total_bytes(device, new_size); btrfs_device_set_disk_total_bytes(device, new_size); btrfs_clear_space_info_full(device->fs_info); - if (list_empty(&device->resized_list)) - list_add_tail(&device->resized_list, - &fs_devices->resized_devices); + if (list_empty(&device->post_commit_list)) + list_add_tail(&device->post_commit_list, + &trans->transaction->dev_update_list); mutex_unlock(&fs_info->chunk_mutex); return btrfs_update_device(trans, device); @@ -2983,7 +2761,7 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree; struct extent_map *em; - em_tree = &fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, length); read_unlock(&em_tree->lock); @@ -3113,10 +2891,6 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) */ lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); - ret = btrfs_can_relocate(fs_info, chunk_offset); - if (ret) - return -ENOSPC; - /* step one, relocate all the extents inside this chunk */ btrfs_scrub_pause(fs_info); ret = btrfs_relocate_block_group(fs_info, chunk_offset); @@ -3124,16 +2898,6 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) if (ret) return ret; - /* - * We add the kobjects here (and after forcing data chunk creation) - * since relocation is the only place we'll create chunks of a new - * type at runtime. The only place where we'll remove the last - * chunk of a type is the call immediately below this one. Even - * so, we're protected against races with the cleaner thread since - * we're covered by the delete_unused_bgs_mutex. - */ - btrfs_add_raid_kobjects(fs_info); - trans = btrfs_start_trans_remove_block_group(root->fs_info, chunk_offset); if (IS_ERR(trans)) { @@ -3233,7 +2997,7 @@ error: static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; u64 bytes_used; u64 chunk_type; @@ -3242,30 +3006,28 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, chunk_type = cache->flags; btrfs_put_block_group(cache); - if (chunk_type & BTRFS_BLOCK_GROUP_DATA) { - spin_lock(&fs_info->data_sinfo->lock); - bytes_used = fs_info->data_sinfo->bytes_used; - spin_unlock(&fs_info->data_sinfo->lock); - - if (!bytes_used) { - struct btrfs_trans_handle *trans; - int ret; + if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) + return 0; - trans = btrfs_join_transaction(fs_info->tree_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); + spin_lock(&fs_info->data_sinfo->lock); + bytes_used = fs_info->data_sinfo->bytes_used; + spin_unlock(&fs_info->data_sinfo->lock); - ret = btrfs_force_chunk_alloc(trans, - BTRFS_BLOCK_GROUP_DATA); - btrfs_end_transaction(trans); - if (ret < 0) - return ret; + if (!bytes_used) { + struct btrfs_trans_handle *trans; + int ret; - btrfs_add_raid_kobjects(fs_info); + trans = btrfs_join_transaction(fs_info->tree_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); - return 1; - } + ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); + btrfs_end_transaction(trans); + if (ret < 0) + return ret; + return 1; } + return 0; } @@ -3444,28 +3206,28 @@ static int chunk_profiles_filter(u64 chunk_type, static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; u64 chunk_used; u64 user_thresh_min; u64 user_thresh_max; int ret = 1; cache = btrfs_lookup_block_group(fs_info, chunk_offset); - chunk_used = btrfs_block_group_used(&cache->item); + chunk_used = cache->used; if (bargs->usage_min == 0) user_thresh_min = 0; else - user_thresh_min = div_factor_fine(cache->key.offset, - bargs->usage_min); + user_thresh_min = div_factor_fine(cache->length, + bargs->usage_min); if (bargs->usage_max == 0) user_thresh_max = 1; else if (bargs->usage_max > 100) - user_thresh_max = cache->key.offset; + user_thresh_max = cache->length; else - user_thresh_max = div_factor_fine(cache->key.offset, - bargs->usage_max); + user_thresh_max = div_factor_fine(cache->length, + bargs->usage_max); if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) ret = 0; @@ -3477,20 +3239,19 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; u64 chunk_used, user_thresh; int ret = 1; cache = btrfs_lookup_block_group(fs_info, chunk_offset); - chunk_used = btrfs_block_group_used(&cache->item); + chunk_used = cache->used; if (bargs->usage_min == 0) user_thresh = 1; else if (bargs->usage > 100) - user_thresh = cache->key.offset; + user_thresh = cache->length; else - user_thresh = div_factor_fine(cache->key.offset, - bargs->usage); + user_thresh = div_factor_fine(cache->length, bargs->usage); if (chunk_used < user_thresh) ret = 0; @@ -3516,6 +3277,18 @@ static int chunk_devid_filter(struct extent_buffer *leaf, return 1; } +static u64 calc_data_stripes(u64 type, int num_stripes) +{ + const int index = btrfs_bg_flags_to_raid_index(type); + const int ncopies = btrfs_raid_array[index].ncopies; + const int nparity = btrfs_raid_array[index].nparity; + + if (nparity) + return num_stripes - nparity; + else + return num_stripes / ncopies; +} + /* [pstart, pend) */ static int chunk_drange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, @@ -3525,22 +3298,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf, int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); u64 stripe_offset; u64 stripe_length; + u64 type; int factor; int i; if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) return 0; - if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { - factor = num_stripes / 2; - } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { - factor = num_stripes - 1; - } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { - factor = num_stripes - 2; - } else { - factor = num_stripes; - } + type = btrfs_chunk_type(leaf, chunk); + factor = calc_data_stripes(type, num_stripes); for (i = 0; i < num_stripes; i++) { stripe = btrfs_stripe_nr(chunk, i); @@ -3601,10 +3367,10 @@ static int chunk_soft_convert_filter(u64 chunk_type, return 0; } -static int should_balance_chunk(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, +static int should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset) { + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_balance_control *bctl = fs_info->balance_ctl; struct btrfs_balance_args *bargs = NULL; u64 chunk_type = btrfs_chunk_type(leaf, chunk); @@ -3784,8 +3550,7 @@ again: spin_unlock(&fs_info->balance_lock); } - ret = should_balance_chunk(fs_info, leaf, chunk, - found_key.offset); + ret = should_balance_chunk(leaf, chunk, found_key.offset); btrfs_release_path(path); if (!ret) { @@ -3899,8 +3664,7 @@ static int alloc_profile_is_valid(u64 flags, int extended) if (flags == 0) return !extended; /* "0" is valid for usual profiles */ - /* true if exactly one bit set */ - return is_power_of_2(flags); + return has_single_bit_set(flags); } static inline int balance_need_close(struct btrfs_fs_info *fs_info) @@ -3964,11 +3728,9 @@ static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, bp += ret; \ } while (0) - if (flags & BTRFS_BALANCE_ARGS_CONVERT) { - int index = btrfs_bg_flags_to_raid_index(bargs->target); - - CHECK_APPEND_1ARG("convert=%s,", get_raid_name(index)); - } + if (flags & BTRFS_BALANCE_ARGS_CONVERT) + CHECK_APPEND_1ARG("convert=%s,", + btrfs_bg_type_to_raid_name(bargs->target)); if (flags & BTRFS_BALANCE_ARGS_SOFT) CHECK_APPEND_NOARG("soft,"); @@ -4089,7 +3851,8 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, int ret; u64 num_devices; unsigned seq; - bool reducing_integrity; + bool reducing_redundancy; + int i; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -4118,49 +3881,54 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, } } - num_devices = btrfs_num_devices(fs_info); + /* + * rw_devices will not change at the moment, device add/delete/replace + * are excluded by EXCL_OP + */ + num_devices = fs_info->fs_devices->rw_devices; - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; - if (num_devices > 1) - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); - if (num_devices > 2) - allowed |= BTRFS_BLOCK_GROUP_RAID5; - if (num_devices > 3) - allowed |= (BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID6); - if (validate_convert_profile(&bctl->data, allowed)) { - int index = btrfs_bg_flags_to_raid_index(bctl->data.target); + /* + * SINGLE profile on-disk has no profile bit, but in-memory we have a + * special bit for it, to make it easier to distinguish. Thus we need + * to set it manually, or balance would refuse the profile. + */ + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) + if (num_devices >= btrfs_raid_array[i].devs_min) + allowed |= btrfs_raid_array[i].bg_flag; + if (validate_convert_profile(&bctl->data, allowed)) { btrfs_err(fs_info, "balance: invalid convert data profile %s", - get_raid_name(index)); + btrfs_bg_type_to_raid_name(bctl->data.target)); ret = -EINVAL; goto out; } if (validate_convert_profile(&bctl->meta, allowed)) { - int index = btrfs_bg_flags_to_raid_index(bctl->meta.target); - btrfs_err(fs_info, "balance: invalid convert metadata profile %s", - get_raid_name(index)); + btrfs_bg_type_to_raid_name(bctl->meta.target)); ret = -EINVAL; goto out; } if (validate_convert_profile(&bctl->sys, allowed)) { - int index = btrfs_bg_flags_to_raid_index(bctl->sys.target); - btrfs_err(fs_info, "balance: invalid convert system profile %s", - get_raid_name(index)); + btrfs_bg_type_to_raid_name(bctl->sys.target)); ret = -EINVAL; goto out; } - /* allow to reduce meta or sys integrity only if force set */ - allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6; + /* + * Allow to reduce metadata or system integrity only if force set for + * profiles with redundancy (copies, parity) + */ + allowed = 0; + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { + if (btrfs_raid_array[i].ncopies >= 2 || + btrfs_raid_array[i].tolerated_failures >= 1) + allowed |= btrfs_raid_array[i].bg_flag; + } do { seq = read_seqbegin(&fs_info->profiles_lock); @@ -4170,9 +3938,9 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && (fs_info->avail_metadata_alloc_bits & allowed) && !(bctl->meta.target & allowed))) - reducing_integrity = true; + reducing_redundancy = true; else - reducing_integrity = false; + reducing_redundancy = false; /* if we're not converting, the target field is uninitialized */ meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? @@ -4181,13 +3949,13 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, bctl->data.target : fs_info->avail_data_alloc_bits; } while (read_seqretry(&fs_info->profiles_lock, seq)); - if (reducing_integrity) { + if (reducing_redundancy) { if (bctl->flags & BTRFS_BALANCE_FORCE) { btrfs_info(fs_info, - "balance: force reducing metadata integrity"); + "balance: force reducing metadata redundancy"); } else { btrfs_err(fs_info, - "balance: reduces metadata integrity, use --force if you want this"); + "balance: reduces metadata redundancy, use --force if you want this"); ret = -EINVAL; goto out; } @@ -4195,12 +3963,18 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { - int meta_index = btrfs_bg_flags_to_raid_index(meta_target); - int data_index = btrfs_bg_flags_to_raid_index(data_target); - btrfs_warn(fs_info, "balance: metadata profile %s has lower redundancy than data profile %s", - get_raid_name(meta_index), get_raid_name(data_index)); + btrfs_bg_type_to_raid_name(meta_target), + btrfs_bg_type_to_raid_name(data_target)); + } + + if (fs_info->send_in_progress) { + btrfs_warn_rl(fs_info, +"cannot run balance while send operations are in progress (%d in progress)", + fs_info->send_in_progress); + ret = -EAGAIN; + goto out; } ret = insert_balance_item(fs_info, bctl); @@ -4661,8 +4435,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) if (IS_ERR(trans)) return PTR_ERR(trans); - uuid_root = btrfs_create_tree(trans, fs_info, - BTRFS_UUID_TREE_OBJECTID); + uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); if (IS_ERR(uuid_root)) { ret = PTR_ERR(uuid_root); btrfs_abort_transaction(trans, ret); @@ -4722,15 +4495,16 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) int slot; int failed = 0; bool retried = false; - bool checked_pending_chunks = false; struct extent_buffer *l; struct btrfs_key key; struct btrfs_super_block *super_copy = fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 old_size = btrfs_device_get_total_bytes(device); u64 diff; + u64 start; new_size = round_down(new_size, fs_info->sectorsize); + start = new_size; diff = round_down(old_size - new_size, fs_info->sectorsize); if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) @@ -4742,6 +4516,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) path->reada = READA_BACK; + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_total_bytes(device, new_size); @@ -4749,7 +4529,21 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) device->fs_devices->total_rw_bytes -= diff; atomic64_sub(diff, &fs_info->free_chunk_space); } - mutex_unlock(&fs_info->chunk_mutex); + + /* + * Once the device's size has been set to the new size, ensure all + * in-memory chunks are synced to disk so that the loop below sees them + * and relocates them accordingly. + */ + if (contains_pending_extent(device, &start, diff)) { + mutex_unlock(&fs_info->chunk_mutex); + ret = btrfs_commit_transaction(trans); + if (ret) + goto done; + } else { + mutex_unlock(&fs_info->chunk_mutex); + btrfs_end_transaction(trans); + } again: key.objectid = device->devid; @@ -4840,40 +4634,10 @@ again: } mutex_lock(&fs_info->chunk_mutex); - - /* - * We checked in the above loop all device extents that were already in - * the device tree. However before we have updated the device's - * total_bytes to the new size, we might have had chunk allocations that - * have not complete yet (new block groups attached to transaction - * handles), and therefore their device extents were not yet in the - * device tree and we missed them in the loop above. So if we have any - * pending chunk using a device extent that overlaps the device range - * that we can not use anymore, commit the current transaction and - * repeat the search on the device tree - this way we guarantee we will - * not have chunks using device extents that end beyond 'new_size'. - */ - if (!checked_pending_chunks) { - u64 start = new_size; - u64 len = old_size - new_size; - - if (contains_pending_extent(trans->transaction, device, - &start, len)) { - mutex_unlock(&fs_info->chunk_mutex); - checked_pending_chunks = true; - failed = 0; - retried = false; - ret = btrfs_commit_transaction(trans); - if (ret) - goto done; - goto again; - } - } - btrfs_device_set_disk_total_bytes(device, new_size); - if (list_empty(&device->resized_list)) - list_add_tail(&device->resized_list, - &fs_info->fs_devices->resized_devices); + if (list_empty(&device->post_commit_list)) + list_add_tail(&device->post_commit_list, + &trans->transaction->dev_update_list); WARN_ON(diff > old_total); btrfs_set_super_total_bytes(super_copy, @@ -4957,14 +4721,13 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID56); } -#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ - - sizeof(struct btrfs_chunk)) \ - / sizeof(struct btrfs_stripe) + 1) +static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) +{ + if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) + return; -#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ - - 2 * sizeof(struct btrfs_disk_key) \ - - 2 * sizeof(struct btrfs_chunk)) \ - / sizeof(struct btrfs_stripe) + 1) + btrfs_set_fs_incompat(info, RAID1C34); +} static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 start, u64 type) @@ -5011,6 +4774,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, sub_stripes = btrfs_raid_array[index].sub_stripes; dev_stripes = btrfs_raid_array[index].dev_stripes; devs_max = btrfs_raid_array[index].devs_max; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS(info); devs_min = btrfs_raid_array[index].devs_min; devs_increment = btrfs_raid_array[index].devs_increment; ncopies = btrfs_raid_array[index].ncopies; @@ -5019,8 +4784,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (type & BTRFS_BLOCK_GROUP_DATA) { max_stripe_size = SZ_1G; max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { /* for larger filesystems, use larger metadata chunks */ if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) @@ -5028,17 +4791,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, else max_stripe_size = SZ_256M; max_chunk_size = max_stripe_size; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info); } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = SZ_32M; max_chunk_size = 2 * max_stripe_size; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; + devs_max = min_t(int, devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); } else { btrfs_err(info, "invalid chunk type 0x%llx requested", type); - BUG_ON(1); + BUG(); } /* We don't want a chunk larger than 10% of writable space */ @@ -5079,7 +4839,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (total_avail == 0) continue; - ret = find_free_dev_extent(trans, device, + ret = find_free_dev_extent(device, max_stripe_size * dev_stripes, &dev_offset, &max_avail); if (ret && ret != -ENOSPC) @@ -5115,8 +4875,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, sort(devices_info, ndevs, sizeof(struct btrfs_device_info), btrfs_cmp_device_info, NULL); - /* round down to number of usable stripes */ - ndevs = round_down(ndevs, devs_increment); + /* + * Round down to number of usable stripes, devs_increment can be any + * number so we can't use round_down() + */ + ndevs -= ndevs % devs_increment; if (ndevs < devs_min) { ret = -ENOSPC; @@ -5205,7 +4968,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, em->block_len = em->len; em->orig_block_len = stripe_size; - em_tree = &info->mapping_tree.map_tree; + em_tree = &info->mapping_tree; write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); if (ret) { @@ -5213,23 +4976,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, free_extent_map(em); goto error; } - - list_add_tail(&em->list, &trans->transaction->pending_chunks); - refcount_inc(&em->refs); write_unlock(&em_tree->lock); ret = btrfs_make_block_group(trans, 0, type, start, chunk_size); if (ret) goto error_del_extent; - for (i = 0; i < map->num_stripes; i++) - btrfs_device_set_bytes_used(map->stripes[i].dev, - map->stripes[i].dev->bytes_used + stripe_size); + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *dev = map->stripes[i].dev; + + btrfs_device_set_bytes_used(dev, dev->bytes_used + stripe_size); + if (list_empty(&dev->post_commit_list)) + list_add_tail(&dev->post_commit_list, + &trans->transaction->dev_update_list); + } atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); free_extent_map(em); check_raid56_incompat_flag(info, type); + check_raid1c34_incompat_flag(info, type); kfree(devices_info); return 0; @@ -5243,8 +5009,6 @@ error_del_extent: free_extent_map(em); /* One for the tree reference */ free_extent_map(em); - /* One for the pending_chunks list reference */ - free_extent_map(em); error: kfree(devices_info); return ret; @@ -5364,9 +5128,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) return __btrfs_alloc_chunk(trans, chunk_offset, type); } -static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; u64 chunk_offset; u64 sys_chunk_offset; u64 alloc_profile; @@ -5386,20 +5150,9 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, static inline int btrfs_chunk_max_errors(struct map_lookup *map) { - int max_errors; - - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_DUP)) { - max_errors = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { - max_errors = 2; - } else { - max_errors = 0; - } + const int index = btrfs_bg_flags_to_raid_index(map->type); - return max_errors; + return btrfs_raid_array[index].tolerated_failures; } int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) @@ -5440,21 +5193,16 @@ end: return readonly; } -void btrfs_mapping_init(struct btrfs_mapping_tree *tree) -{ - extent_map_tree_init(&tree->map_tree); -} - -void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) +void btrfs_mapping_tree_free(struct extent_map_tree *tree) { struct extent_map *em; while (1) { - write_lock(&tree->map_tree.lock); - em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + write_lock(&tree->lock); + em = lookup_extent_mapping(tree, 0, (u64)-1); if (em) - remove_extent_mapping(&tree->map_tree, em); - write_unlock(&tree->map_tree.lock); + remove_extent_mapping(tree, em); + write_unlock(&tree->lock); if (!em) break; /* once for us */ @@ -5481,7 +5229,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return 1; map = em->map_lookup; - if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) ret = map->sub_stripes; @@ -5555,7 +5303,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev; ASSERT((map->type & - (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); + (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); if (map->type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = map->sub_stripes; @@ -5666,12 +5414,13 @@ void btrfs_put_bbio(struct btrfs_bio *bbio) * replace. */ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, - u64 logical, u64 length, + u64 logical, u64 *length_ret, struct btrfs_bio **bbio_ret) { struct extent_map *em; struct map_lookup *map; struct btrfs_bio *bbio; + u64 length = *length_ret; u64 offset; u64 stripe_nr; u64 stripe_nr_end; @@ -5704,7 +5453,8 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, } offset = logical - em->start; - length = min_t(u64, em->len - offset, length); + length = min_t(u64, em->start + em->len - logical, length); + *length_ret = length; stripe_len = map->stripe_len; /* @@ -5744,7 +5494,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, &remaining_stripes); div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); last_stripe *= sub_stripes; - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_DUP)) { num_stripes = map->num_stripes; } else { @@ -5988,6 +5738,106 @@ static bool need_full_stripe(enum btrfs_map_op op) return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); } +/* + * btrfs_get_io_geometry - calculates the geomery of a particular (address, len) + * tuple. This information is used to calculate how big a + * particular bio can get before it straddles a stripe. + * + * @fs_info - the filesystem + * @logical - address that we want to figure out the geometry of + * @len - the length of IO we are going to perform, starting at @logical + * @op - type of operation - write or read + * @io_geom - pointer used to return values + * + * Returns < 0 in case a chunk for the given logical address cannot be found, + * usually shouldn't happen unless @logical is corrupted, 0 otherwise. + */ +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 len, struct btrfs_io_geometry *io_geom) +{ + struct extent_map *em; + struct map_lookup *map; + u64 offset; + u64 stripe_offset; + u64 stripe_nr; + u64 stripe_len; + u64 raid56_full_stripe_start = (u64)-1; + int data_stripes; + int ret = 0; + + ASSERT(op != BTRFS_MAP_DISCARD); + + em = btrfs_get_chunk_map(fs_info, logical, len); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + /* Offset of this logical address in the chunk */ + offset = logical - em->start; + /* Len of a stripe in a chunk */ + stripe_len = map->stripe_len; + /* Stripe wher this block falls in */ + stripe_nr = div64_u64(offset, stripe_len); + /* Offset of stripe in the chunk */ + stripe_offset = stripe_nr * stripe_len; + if (offset < stripe_offset) { + btrfs_crit(fs_info, +"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", + stripe_offset, offset, em->start, logical, stripe_len); + ret = -EINVAL; + goto out; + } + + /* stripe_offset is the offset of this block in its stripe */ + stripe_offset = offset - stripe_offset; + data_stripes = nr_data_stripes(map); + + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + u64 max_len = stripe_len - stripe_offset; + + /* + * In case of raid56, we need to know the stripe aligned start + */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + unsigned long full_stripe_len = stripe_len * data_stripes; + raid56_full_stripe_start = offset; + + /* + * Allow a write of a full stripe, but make sure we + * don't allow straddling of stripes + */ + raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, + full_stripe_len); + raid56_full_stripe_start *= full_stripe_len; + + /* + * For writes to RAID[56], allow a full stripeset across + * all disks. For other RAID types and for RAID[56] + * reads, just allow a single stripe (on a single disk). + */ + if (op == BTRFS_MAP_WRITE) { + max_len = stripe_len * data_stripes - + (offset - raid56_full_stripe_start); + } + } + len = min_t(u64, em->len - offset, max_len); + } else { + len = em->len - offset; + } + + io_geom->len = len; + io_geom->offset = offset; + io_geom->stripe_len = stripe_len; + io_geom->stripe_nr = stripe_nr; + io_geom->stripe_offset = stripe_offset; + io_geom->raid56_stripe_offset = raid56_full_stripe_start; + +out: + /* once for us */ + free_extent_map(em); + return ret; +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, @@ -5996,11 +5846,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, { struct extent_map *em; struct map_lookup *map; - u64 offset; u64 stripe_offset; u64 stripe_nr; u64 stripe_len; u32 stripe_index; + int data_stripes; int i; int ret = 0; int num_stripes; @@ -6013,76 +5863,28 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int patch_the_first_stripe_for_dev_replace = 0; u64 physical_to_patch_in_first_stripe = 0; u64 raid56_full_stripe_start = (u64)-1; + struct btrfs_io_geometry geom; + + ASSERT(bbio_ret); if (op == BTRFS_MAP_DISCARD) return __btrfs_map_block_for_discard(fs_info, logical, - *length, bbio_ret); + length, bbio_ret); - em = btrfs_get_chunk_map(fs_info, logical, *length); - if (IS_ERR(em)) - return PTR_ERR(em); + ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); + if (ret < 0) + return ret; + em = btrfs_get_chunk_map(fs_info, logical, *length); + ASSERT(!IS_ERR(em)); map = em->map_lookup; - offset = logical - em->start; - - stripe_len = map->stripe_len; - stripe_nr = offset; - /* - * stripe_nr counts the total number of stripes we have to stride - * to get to this block - */ - stripe_nr = div64_u64(stripe_nr, stripe_len); - - stripe_offset = stripe_nr * stripe_len; - if (offset < stripe_offset) { - btrfs_crit(fs_info, - "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu", - stripe_offset, offset, em->start, logical, - stripe_len); - free_extent_map(em); - return -EINVAL; - } - - /* stripe_offset is the offset of this block in its stripe*/ - stripe_offset = offset - stripe_offset; - - /* if we're here for raid56, we need to know the stripe aligned start */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); - raid56_full_stripe_start = offset; - - /* allow a write of a full stripe, but make sure we don't - * allow straddling of stripes - */ - raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, - full_stripe_len); - raid56_full_stripe_start *= full_stripe_len; - } - if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { - u64 max_len; - /* For writes to RAID[56], allow a full stripeset across all disks. - For other RAID types and for RAID[56] reads, just allow a single - stripe (on a single disk). */ - if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - (op == BTRFS_MAP_WRITE)) { - max_len = stripe_len * nr_data_stripes(map) - - (offset - raid56_full_stripe_start); - } else { - /* we limit the length of each bio to what fits in a stripe */ - max_len = stripe_len - stripe_offset; - } - *length = min_t(u64, em->len - offset, max_len); - } else { - *length = em->len - offset; - } - - /* - * This is for when we're called from btrfs_bio_fits_in_stripe and all - * it cares about is the length - */ - if (!bbio_ret) - goto out; + *length = geom.len; + stripe_len = geom.stripe_len; + stripe_nr = geom.stripe_nr; + stripe_offset = geom.stripe_offset; + raid56_full_stripe_start = geom.raid56_stripe_offset; + data_stripes = nr_data_stripes(map); down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); @@ -6114,7 +5916,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, &stripe_index); if (!need_full_stripe(op)) mirror_num = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { if (need_full_stripe(op)) num_stripes = map->num_stripes; else if (mirror_num) @@ -6156,7 +5958,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div64_u64(raid56_full_stripe_start, - stripe_len * nr_data_stripes(map)); + stripe_len * data_stripes); /* RAID[56] write or recovery. Return all stripes */ num_stripes = map->num_stripes; @@ -6172,10 +5974,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * Mirror #3 is RAID6 Q block. */ stripe_nr = div_u64_rem(stripe_nr, - nr_data_stripes(map), &stripe_index); + data_stripes, &stripe_index); if (mirror_num > 1) - stripe_index = nr_data_stripes(map) + - mirror_num - 2; + stripe_index = data_stripes + mirror_num - 2; /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, @@ -6233,8 +6034,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, div_u64_rem(stripe_nr, num_stripes, &rot); /* Fill in the logical address of each stripe */ - tmp = stripe_nr * nr_data_stripes(map); - for (i = 0; i < nr_data_stripes(map); i++) + tmp = stripe_nr * data_stripes; + for (i = 0; i < data_stripes; i++) bbio->raid_map[(i+rot) % num_stripes] = em->start + (tmp + i) * map->stripe_len; @@ -6448,52 +6249,8 @@ static void btrfs_end_bio(struct bio *bio) } } -/* - * see run_scheduled_bios for a description of why bios are collected for - * async submit. - * - * This will add one bio to the pending list for a device and make sure - * the work struct is scheduled. - */ -static noinline void btrfs_schedule_bio(struct btrfs_device *device, - struct bio *bio) -{ - struct btrfs_fs_info *fs_info = device->fs_info; - int should_queue = 1; - struct btrfs_pending_bios *pending_bios; - - /* don't bother with additional async steps for reads, right now */ - if (bio_op(bio) == REQ_OP_READ) { - btrfsic_submit_bio(bio); - return; - } - - WARN_ON(bio->bi_next); - bio->bi_next = NULL; - - spin_lock(&device->io_lock); - if (op_is_sync(bio->bi_opf)) - pending_bios = &device->pending_sync_bios; - else - pending_bios = &device->pending_bios; - - if (pending_bios->tail) - pending_bios->tail->bi_next = bio; - - pending_bios->tail = bio; - if (!pending_bios->head) - pending_bios->head = bio; - if (device->running_pending) - should_queue = 0; - - spin_unlock(&device->io_lock); - - if (should_queue) - btrfs_queue_work(fs_info->submit_workers, &device->work); -} - static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, - u64 physical, int dev_nr, int async) + u64 physical, int dev_nr) { struct btrfs_device *dev = bbio->stripes[dev_nr].dev; struct btrfs_fs_info *fs_info = bbio->fs_info; @@ -6511,10 +6268,7 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, btrfs_bio_counter_inc_noblocked(fs_info); - if (async) - btrfs_schedule_bio(dev, bio); - else - btrfsic_submit_bio(bio); + btrfsic_submit_bio(bio); } static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) @@ -6535,7 +6289,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) } blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num, int async_submit) + int mirror_num) { struct btrfs_device *dev; struct bio *first_bio = bio; @@ -6604,7 +6358,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, bio = first_bio; submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, - dev_nr, async_submit); + dev_nr); } btrfs_bio_counter_dec(fs_info); return BLK_STS_OK; @@ -6708,105 +6462,9 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, else generate_random_uuid(dev->uuid); - btrfs_init_work(&dev->work, btrfs_submit_helper, - pending_bios_fn, NULL, NULL); - return dev; } -/* Return -EIO if any error, otherwise return 0. */ -static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 logical) -{ - u64 length; - u64 stripe_len; - u16 num_stripes; - u16 sub_stripes; - u64 type; - u64 features; - bool mixed = false; - - length = btrfs_chunk_length(leaf, chunk); - stripe_len = btrfs_chunk_stripe_len(leaf, chunk); - num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); - type = btrfs_chunk_type(leaf, chunk); - - if (!num_stripes) { - btrfs_err(fs_info, "invalid chunk num_stripes: %u", - num_stripes); - return -EIO; - } - if (!IS_ALIGNED(logical, fs_info->sectorsize)) { - btrfs_err(fs_info, "invalid chunk logical %llu", logical); - return -EIO; - } - if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { - btrfs_err(fs_info, "invalid chunk sectorsize %u", - btrfs_chunk_sector_size(leaf, chunk)); - return -EIO; - } - if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { - btrfs_err(fs_info, "invalid chunk length %llu", length); - return -EIO; - } - if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { - btrfs_err(fs_info, "invalid chunk stripe length: %llu", - stripe_len); - return -EIO; - } - if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & - type) { - btrfs_err(fs_info, "unrecognized chunk type: %llu", - ~(BTRFS_BLOCK_GROUP_TYPE_MASK | - BTRFS_BLOCK_GROUP_PROFILE_MASK) & - btrfs_chunk_type(leaf, chunk)); - return -EIO; - } - - if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { - btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type); - return -EIO; - } - - if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && - (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { - btrfs_err(fs_info, - "system chunk with data or metadata type: 0x%llx", type); - return -EIO; - } - - features = btrfs_super_incompat_flags(fs_info->super_copy); - if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) - mixed = true; - - if (!mixed) { - if ((type & BTRFS_BLOCK_GROUP_METADATA) && - (type & BTRFS_BLOCK_GROUP_DATA)) { - btrfs_err(fs_info, - "mixed chunk type in non-mixed mode: 0x%llx", type); - return -EIO; - } - } - - if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || - (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || - (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || - ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && - num_stripes != 1)) { - btrfs_err(fs_info, - "invalid num_stripes:sub_stripes %u:%u for profile %llu", - num_stripes, sub_stripes, - type & BTRFS_BLOCK_GROUP_PROFILE_MASK); - return -EIO; - } - - return 0; -} - static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, bool error) { @@ -6818,11 +6476,31 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, devid, uuid); } -static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, - struct extent_buffer *leaf, +static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) +{ + int index = btrfs_bg_flags_to_raid_index(type); + int ncopies = btrfs_raid_array[index].ncopies; + int data_stripes; + + switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + case BTRFS_BLOCK_GROUP_RAID5: + data_stripes = num_stripes - 1; + break; + case BTRFS_BLOCK_GROUP_RAID6: + data_stripes = num_stripes - 2; + break; + default: + data_stripes = num_stripes / ncopies; + break; + } + return div_u64(chunk_len, data_stripes); +} + +static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; u64 logical; @@ -6837,13 +6515,19 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, length = btrfs_chunk_length(leaf, chunk); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); - if (ret) - return ret; + /* + * Only need to verify chunk item if we're reading from sys chunk array, + * as chunk item in tree block is already verified by tree-checker. + */ + if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { + ret = btrfs_check_chunk_valid(leaf, chunk, logical); + if (ret) + return ret; + } - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); - read_unlock(&map_tree->map_tree.lock); + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, logical, 1); + read_unlock(&map_tree->lock); /* already mapped? */ if (em && em->start <= logical && em->start + em->len > logical) { @@ -6877,6 +6561,8 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, map->type = btrfs_chunk_type(leaf, chunk); map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); map->verified_stripes = 0; + em->orig_block_len = calc_stripe_length(map->type, em->len, + map->num_stripes); for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); @@ -6910,9 +6596,9 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, } - write_lock(&map_tree->map_tree.lock); - ret = add_extent_mapping(&map_tree->map_tree, em, 0); - write_unlock(&map_tree->map_tree.lock); + write_lock(&map_tree->lock); + ret = add_extent_mapping(map_tree, em, 0); + write_unlock(&map_tree->lock); if (ret < 0) { btrfs_err(fs_info, "failed to add chunk map, start=%llu len=%llu: %d", @@ -6972,7 +6658,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, if (IS_ERR(fs_devices)) return fs_devices; - fs_devices->seeding = 1; + fs_devices->seeding = true; fs_devices->opened = 1; return fs_devices; } @@ -7001,10 +6687,10 @@ out: return fs_devices; } -static int read_one_dev(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, +static int read_one_dev(struct extent_buffer *leaf, struct btrfs_dev_item *dev_item) { + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; u64 devid; @@ -7161,48 +6847,49 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) sb_array_offset += len; cur_offset += len; - if (key.type == BTRFS_CHUNK_ITEM_KEY) { - chunk = (struct btrfs_chunk *)sb_array_offset; - /* - * At least one btrfs_chunk with one stripe must be - * present, exact stripe count check comes afterwards - */ - len = btrfs_chunk_item_size(1); - if (cur_offset + len > array_size) - goto out_short_read; - - num_stripes = btrfs_chunk_num_stripes(sb, chunk); - if (!num_stripes) { - btrfs_err(fs_info, - "invalid number of stripes %u in sys_array at offset %u", - num_stripes, cur_offset); - ret = -EIO; - break; - } + if (key.type != BTRFS_CHUNK_ITEM_KEY) { + btrfs_err(fs_info, + "unexpected item type %u in sys_array at offset %u", + (u32)key.type, cur_offset); + ret = -EIO; + break; + } - type = btrfs_chunk_type(sb, chunk); - if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { - btrfs_err(fs_info, - "invalid chunk type %llu in sys_array at offset %u", - type, cur_offset); - ret = -EIO; - break; - } + chunk = (struct btrfs_chunk *)sb_array_offset; + /* + * At least one btrfs_chunk with one stripe must be present, + * exact stripe count check comes afterwards + */ + len = btrfs_chunk_item_size(1); + if (cur_offset + len > array_size) + goto out_short_read; - len = btrfs_chunk_item_size(num_stripes); - if (cur_offset + len > array_size) - goto out_short_read; + num_stripes = btrfs_chunk_num_stripes(sb, chunk); + if (!num_stripes) { + btrfs_err(fs_info, + "invalid number of stripes %u in sys_array at offset %u", + num_stripes, cur_offset); + ret = -EIO; + break; + } - ret = read_one_chunk(fs_info, &key, sb, chunk); - if (ret) - break; - } else { + type = btrfs_chunk_type(sb, chunk); + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { btrfs_err(fs_info, - "unexpected item type %u in sys_array at offset %u", - (u32)key.type, cur_offset); + "invalid chunk type %llu in sys_array at offset %u", + type, cur_offset); ret = -EIO; break; } + + len = btrfs_chunk_item_size(num_stripes); + if (cur_offset + len > array_size) + goto out_short_read; + + ret = read_one_chunk(&key, sb, chunk); + if (ret) + break; + array_ptr += len; sb_array_offset += len; cur_offset += len; @@ -7230,14 +6917,14 @@ out_short_read: bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; u64 next_start = 0; bool ret = true; - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1); - read_unlock(&map_tree->map_tree.lock); + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, 0, (u64)-1); + read_unlock(&map_tree->lock); /* No chunk at all? Return false anyway */ if (!em) { ret = false; @@ -7275,10 +6962,10 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, next_start = extent_map_end(em); free_extent_map(em); - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, next_start, + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, next_start, (u64)(-1) - next_start); - read_unlock(&map_tree->map_tree.lock); + read_unlock(&map_tree->lock); } out: return ret; @@ -7334,14 +7021,14 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) struct btrfs_dev_item *dev_item; dev_item = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); - ret = read_one_dev(fs_info, leaf, dev_item); + ret = read_one_dev(leaf, dev_item); if (ret) goto error; total_dev++; } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); - ret = read_one_chunk(fs_info, &found_key, leaf, chunk); + ret = read_one_chunk(&found_key, leaf, chunk); if (ret) goto error; } @@ -7393,18 +7080,32 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) } } -static void __btrfs_reset_dev_stats(struct btrfs_device *dev) +static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, + const struct btrfs_dev_stats_item *ptr, + int index) { - int i; + u64 val; - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) - btrfs_dev_stat_reset(dev, i); + read_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); + return val; +} + +static void btrfs_set_dev_stats_value(struct extent_buffer *eb, + struct btrfs_dev_stats_item *ptr, + int index, u64 val) +{ + write_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); } int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) { struct btrfs_key key; - struct btrfs_key found_key; struct btrfs_root *dev_root = fs_info->dev_root; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct extent_buffer *eb; @@ -7415,10 +7116,8 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) int i; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { @@ -7430,14 +7129,14 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) key.offset = device->devid; ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); if (ret) { - __btrfs_reset_dev_stats(device); + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_dev_stat_set(device, i, 0); device->dev_stats_valid = 1; btrfs_release_path(path); continue; } slot = path->slots[0]; eb = path->nodes[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); item_size = btrfs_item_size_nr(eb, slot); ptr = btrfs_item_ptr(eb, slot, @@ -7448,7 +7147,7 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) btrfs_dev_stat_set(device, i, btrfs_dev_stats_value(eb, ptr, i)); else - btrfs_dev_stat_reset(device, i); + btrfs_dev_stat_set(device, i, 0); } device->dev_stats_valid = 1; @@ -7457,7 +7156,6 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) } mutex_unlock(&fs_devices->device_list_mutex); -out: btrfs_free_path(path); return ret < 0 ? ret : 0; } @@ -7530,9 +7228,9 @@ out: /* * called from commit_transaction. Writes all changed device stats to disk. */ -int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; int stats_cnt; @@ -7631,7 +7329,7 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, stats->values[i] = btrfs_dev_stat_read_and_reset(dev, i); else - btrfs_dev_stat_reset(dev, i); + btrfs_dev_stat_set(dev, i, 0); } } else { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) @@ -7674,51 +7372,34 @@ void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_pat } /* - * Update the size of all devices, which is used for writing out the - * super blocks. + * Update the size and bytes used for each device where it changed. This is + * delayed since we would otherwise get errors while writing out the + * superblocks. + * + * Must be invoked during transaction commit. */ -void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) +void btrfs_commit_device_sizes(struct btrfs_transaction *trans) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *curr, *next; - if (list_empty(&fs_devices->resized_devices)) - return; - - mutex_lock(&fs_devices->device_list_mutex); - mutex_lock(&fs_info->chunk_mutex); - list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, - resized_list) { - list_del_init(&curr->resized_list); - curr->commit_total_bytes = curr->disk_total_bytes; - } - mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_devices->device_list_mutex); -} + ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); -/* Must be invoked during the transaction commit */ -void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct extent_map *em; - struct map_lookup *map; - struct btrfs_device *dev; - int i; - - if (list_empty(&trans->pending_chunks)) + if (list_empty(&trans->dev_update_list)) return; - /* In order to kick the device replace finish process */ - mutex_lock(&fs_info->chunk_mutex); - list_for_each_entry(em, &trans->pending_chunks, list) { - map = em->map_lookup; - - for (i = 0; i < map->num_stripes; i++) { - dev = map->stripes[i].dev; - dev->commit_bytes_used = dev->bytes_used; - } + /* + * We don't need the device_list_mutex here. This list is owned by the + * transaction and the transaction must complete before the device is + * released. + */ + mutex_lock(&trans->fs_info->chunk_mutex); + list_for_each_entry_safe(curr, next, &trans->dev_update_list, + post_commit_list) { + list_del_init(&curr->post_commit_list); + curr->commit_total_bytes = curr->disk_total_bytes; + curr->commit_bytes_used = curr->bytes_used; } - mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&trans->fs_info->chunk_mutex); } void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) @@ -7744,38 +7425,18 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) */ int btrfs_bg_type_to_factor(u64 flags) { - if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - return 2; - return 1; + const int index = btrfs_bg_flags_to_raid_index(flags); + + return btrfs_raid_array[index].ncopies; } -static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) -{ - int index = btrfs_bg_flags_to_raid_index(type); - int ncopies = btrfs_raid_array[index].ncopies; - int data_stripes; - - switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { - case BTRFS_BLOCK_GROUP_RAID5: - data_stripes = num_stripes - 1; - break; - case BTRFS_BLOCK_GROUP_RAID6: - data_stripes = num_stripes - 2; - break; - default: - data_stripes = num_stripes / ncopies; - break; - } - return div_u64(chunk_len, data_stripes); -} static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 chunk_offset, u64 devid, u64 physical_offset, u64 physical_len) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; struct btrfs_device *dev; @@ -7864,7 +7525,7 @@ out: static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct rb_node *node; int ret = 0; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 3ad9d58d1b66..fc1b564b9cfe 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -18,9 +18,20 @@ extern struct mutex uuid_mutex; #define BTRFS_STRIPE_LEN SZ_64K struct buffer_head; -struct btrfs_pending_bios { - struct bio *head; - struct bio *tail; + +struct btrfs_io_geometry { + /* remaining bytes before crossing a stripe */ + u64 len; + /* offset of logical address in chunk */ + u64 offset; + /* length of single IO stripe */ + u64 stripe_len; + /* number of stripe where address falls */ + u64 stripe_nr; + /* offset of address in stripe */ + u64 stripe_offset; + /* offset of raid56 stripe into the chunk */ + u64 raid56_stripe_offset; }; /* @@ -43,8 +54,9 @@ struct btrfs_pending_bios { #define BTRFS_DEV_STATE_FLUSH_SENT (4) struct btrfs_device { - struct list_head dev_list; - struct list_head dev_alloc_list; + struct list_head dev_list; /* device_list_mutex */ + struct list_head dev_alloc_list; /* chunk mutex */ + struct list_head post_commit_list; /* chunk mutex */ struct btrfs_fs_devices *fs_devices; struct btrfs_fs_info *fs_info; @@ -52,13 +64,6 @@ struct btrfs_device { u64 generation; - spinlock_t io_lock ____cacheline_aligned; - int running_pending; - /* regular prio bios */ - struct btrfs_pending_bios pending_bios; - /* sync bios */ - struct btrfs_pending_bios pending_sync_bios; - struct block_device *bdev; /* the mode sent to blkdev_get */ @@ -66,7 +71,6 @@ struct btrfs_device { unsigned long dev_state; blk_status_t last_flush_error; - int flush_bio_sent; #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED seqcount_t data_seqcount; @@ -102,18 +106,12 @@ struct btrfs_device { * size of the device on the current transaction * * This variant is update when committing the transaction, - * and protected by device_list_mutex + * and protected by chunk mutex */ u64 commit_total_bytes; /* bytes used on the current transaction */ u64 commit_bytes_used; - /* - * used to manage the device which is resized - * - * It is protected by chunk_lock. - */ - struct list_head resized_list; /* for sending down flush barriers */ struct bio *flush_bio; @@ -123,7 +121,6 @@ struct btrfs_device { struct scrub_ctx *scrub_ctx; struct btrfs_work work; - struct rcu_head rcu; /* readahead state */ atomic_t reada_in_flight; @@ -139,6 +136,8 @@ struct btrfs_device { /* Counter to record the change of device stats */ atomic_t dev_stats_ccnt; atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; + + struct extent_io_tree alloc_state; }; /* @@ -233,21 +232,25 @@ struct btrfs_fs_devices { * this mutex lock. */ struct mutex device_list_mutex; + + /* List of all devices, protected by device_list_mutex */ struct list_head devices; - struct list_head resized_devices; - /* devices not currently being allocated */ + /* + * Devices which can satisfy space allocation. Protected by + * chunk_mutex + */ struct list_head alloc_list; struct btrfs_fs_devices *seed; - int seeding; + bool seeding; int opened; /* set when we find or add a device that doesn't have the * nonrot flag set */ - int rotating; + bool rotating; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ @@ -258,6 +261,15 @@ struct btrfs_fs_devices { #define BTRFS_BIO_INLINE_CSUM_SIZE 64 +#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ + - sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + +#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ + - 2 * sizeof(struct btrfs_disk_key) \ + - 2 * sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + /* * we need the mirror number and stripe index to be passed around * the call chain while we are processing end_io (especially errors). @@ -307,7 +319,6 @@ struct btrfs_bio { u64 map_type; /* get from map_lookup->type */ bio_end_io_t *end_io; struct bio *orig_bio; - unsigned long flags; void *private; atomic_t error; int max_errors; @@ -332,16 +343,16 @@ struct btrfs_device_info { }; struct btrfs_raid_attr { - int sub_stripes; /* sub_stripes info for map */ - int dev_stripes; /* stripes per dev */ - int devs_max; /* max devs to use */ - int devs_min; /* min devs needed */ - int tolerated_failures; /* max tolerated fail devs */ - int devs_increment; /* ndevs has to be a multiple of this */ - int ncopies; /* how many copies to data has */ - int nparity; /* number of stripes worth of bytes to store + u8 sub_stripes; /* sub_stripes info for map */ + u8 dev_stripes; /* stripes per dev */ + u8 devs_max; /* max devs to use */ + u8 devs_min; /* min devs needed */ + u8 tolerated_failures; /* max tolerated fail devs */ + u8 devs_increment; /* ndevs has to be a multiple of this */ + u8 ncopies; /* how many copies to data has */ + u8 nparity; /* number of stripes worth of bytes to store * parity information */ - int mindev_error; /* error code if min devs requisite is unmet */ + u8 mindev_error; /* error code if min devs requisite is unmet */ const char raid_name[8]; /* name of the raid */ u64 bg_flag; /* block group flag of the raid */ }; @@ -390,6 +401,7 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio) return BTRFS_MAP_WRITE; default: WARN_ON_ONCE(1); + /* fall through */ case REQ_OP_READ: return BTRFS_MAP_READ; } @@ -403,15 +415,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret); +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 len, struct btrfs_io_geometry *io_geom); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 **logical, int *naddrs, int *stripe_len); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type); -void btrfs_mapping_init(struct btrfs_mapping_tree *tree); -void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); +void btrfs_mapping_tree_free(struct extent_map_tree *tree); blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num, int async_submit); + int mirror_num); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, @@ -449,22 +462,16 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset); -int find_free_dev_extent_start(struct btrfs_transaction *transaction, - struct btrfs_device *device, u64 num_bytes, - u64 search_start, u64 *start, u64 *max_avail); -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_get_dev_stats *stats); void btrfs_init_devices_late(struct btrfs_fs_info *fs_info); int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); -int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); -void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev); +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path); int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, @@ -528,12 +535,6 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev, atomic_inc(&dev->dev_stats_ccnt); } -static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, - int index) -{ - btrfs_dev_stat_set(dev, index, 0); -} - /* * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which * can be used as index to access btrfs_raid_array[]. @@ -544,6 +545,10 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) return BTRFS_RAID_RAID10; else if (flags & BTRFS_BLOCK_GROUP_RAID1) return BTRFS_RAID_RAID1; + else if (flags & BTRFS_BLOCK_GROUP_RAID1C3) + return BTRFS_RAID_RAID1C3; + else if (flags & BTRFS_BLOCK_GROUP_RAID1C4) + return BTRFS_RAID_RAID1C4; else if (flags & BTRFS_BLOCK_GROUP_DUP) return BTRFS_RAID_DUP; else if (flags & BTRFS_BLOCK_GROUP_RAID0) @@ -556,18 +561,16 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ } -const char *get_raid_name(enum btrfs_raid_types type); - -void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); -void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans); +void btrfs_commit_device_sizes(struct btrfs_transaction *trans); -struct list_head *btrfs_get_fs_uuids(void); +struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev); int btrfs_bg_type_to_factor(u64 flags); +const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index f141b45ce349..95d9aebff2c4 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -76,9 +76,8 @@ out: return ret; } -static int do_setxattr(struct btrfs_trans_handle *trans, - struct inode *inode, const char *name, - const void *value, size_t size, int flags) +int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { struct btrfs_dir_item *di = NULL; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -87,6 +86,8 @@ static int do_setxattr(struct btrfs_trans_handle *trans, size_t name_len = strlen(name); int ret = 0; + ASSERT(trans); + if (name_len + size > BTRFS_MAX_XATTR_SIZE(root->fs_info)) return -ENOSPC; @@ -174,7 +175,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans, char *ptr; if (size > old_data_len) { - if (btrfs_leaf_free_space(fs_info, leaf) < + if (btrfs_leaf_free_space(leaf) < (size - old_data_len)) { ret = -ENOSPC; goto out; @@ -184,17 +185,15 @@ static int do_setxattr(struct btrfs_trans_handle *trans, if (old_data_len + name_len + sizeof(*di) == item_size) { /* No other xattrs packed in the same leaf item. */ if (size > old_data_len) - btrfs_extend_item(fs_info, path, - size - old_data_len); + btrfs_extend_item(path, size - old_data_len); else if (size < old_data_len) - btrfs_truncate_item(fs_info, path, - data_size, 1); + btrfs_truncate_item(path, data_size, 1); } else { /* There are other xattrs packed in the same item. */ ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto out; - btrfs_extend_item(fs_info, path, data_size); + btrfs_extend_item(path, data_size); } item = btrfs_item_nr(slot); @@ -214,36 +213,32 @@ static int do_setxattr(struct btrfs_trans_handle *trans, } out: btrfs_free_path(path); + if (!ret) + set_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); return ret; } /* * @value: "" makes the attribute to empty, NULL removes it */ -int btrfs_setxattr(struct btrfs_trans_handle *trans, - struct inode *inode, const char *name, - const void *value, size_t size, int flags) +int btrfs_setxattr_trans(struct inode *inode, const char *name, + const void *value, size_t size, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; int ret; - if (btrfs_root_readonly(root)) - return -EROFS; - - if (trans) - return do_setxattr(trans, inode, name, value, size, flags); - trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = do_setxattr(trans, inode, name, value, size, flags); + ret = btrfs_setxattr(trans, inode, name, value, size, flags); if (ret) goto out; inode_inc_iversion(inode); inode->i_ctime = current_time(inode); - set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); out: @@ -370,7 +365,7 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, size_t size, int flags) { name = xattr_full_name(handler, name); - return btrfs_setxattr(NULL, inode, name, buffer, size, flags); + return btrfs_setxattr_trans(inode, name, buffer, size, flags); } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, @@ -378,8 +373,30 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, const char *name, const void *value, size_t size, int flags) { + int ret; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + name = xattr_full_name(handler, name); - return btrfs_set_prop(inode, name, value, size, flags); + ret = btrfs_validate_prop(name, value, size); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_set_prop(trans, inode, name, value, size, flags); + if (!ret) { + inode_inc_iversion(inode); + inode->i_ctime = current_time(inode); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + } + + btrfs_end_transaction(trans); + + return ret; } static const struct xattr_handler btrfs_security_xattr_handler = { @@ -419,10 +436,10 @@ const struct xattr_handler *btrfs_xattr_handlers[] = { }; static int btrfs_initxattrs(struct inode *inode, - const struct xattr *xattr_array, void *fs_info) + const struct xattr *xattr_array, void *fs_private) { + struct btrfs_trans_handle *trans = fs_private; const struct xattr *xattr; - struct btrfs_trans_handle *trans = fs_info; unsigned int nofs_flag; char *name; int err = 0; @@ -442,7 +459,7 @@ static int btrfs_initxattrs(struct inode *inode, strcpy(name, XATTR_SECURITY_PREFIX); strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); err = btrfs_setxattr(trans, inode, name, xattr->value, - xattr->value_len, 0); + xattr->value_len, 0); kfree(name); if (err < 0) break; diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 471fcac6ff55..1cd3fc0a8f17 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -12,9 +12,10 @@ extern const struct xattr_handler *btrfs_xattr_handlers[]; int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size); -int btrfs_setxattr(struct btrfs_trans_handle *trans, - struct inode *inode, const char *name, - const void *value, size_t size, int flags); +int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, + const char *name, const void *value, size_t size, int flags); +int btrfs_setxattr_trans(struct inode *inode, const char *name, + const void *value, size_t size, int flags); ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index b86b7ad6b900..a6c90a003c12 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -29,19 +29,9 @@ struct workspace { static struct workspace_manager wsm; -static void zlib_init_workspace_manager(void) +struct list_head *zlib_get_workspace(unsigned int level) { - btrfs_init_workspace_manager(&wsm, &btrfs_zlib_compress); -} - -static void zlib_cleanup_workspace_manager(void) -{ - btrfs_cleanup_workspace_manager(&wsm); -} - -static struct list_head *zlib_get_workspace(unsigned int level) -{ - struct list_head *ws = btrfs_get_workspace(&wsm, level); + struct list_head *ws = btrfs_get_workspace(BTRFS_COMPRESS_ZLIB, level); struct workspace *workspace = list_entry(ws, struct workspace, list); workspace->level = level; @@ -49,12 +39,7 @@ static struct list_head *zlib_get_workspace(unsigned int level) return ws; } -static void zlib_put_workspace(struct list_head *ws) -{ - btrfs_put_workspace(&wsm, ws); -} - -static void zlib_free_workspace(struct list_head *ws) +void zlib_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -63,7 +48,7 @@ static void zlib_free_workspace(struct list_head *ws) kfree(workspace); } -static struct list_head *zlib_alloc_workspace(unsigned int level) +struct list_head *zlib_alloc_workspace(unsigned int level) { struct workspace *workspace; int workspacesize; @@ -88,13 +73,9 @@ fail: return ERR_PTR(-ENOMEM); } -static int zlib_compress_pages(struct list_head *ws, - struct address_space *mapping, - u64 start, - struct page **pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out) +int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; @@ -228,7 +209,7 @@ out: return ret; } -static int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) +int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0, ret2; @@ -319,10 +300,9 @@ done: return ret; } -static int zlib_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen) +int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; @@ -418,23 +398,8 @@ next: return ret; } -static unsigned int zlib_set_level(unsigned int level) -{ - if (!level) - return BTRFS_ZLIB_DEFAULT_LEVEL; - - return min_t(unsigned int, level, 9); -} - const struct btrfs_compress_op btrfs_zlib_compress = { - .init_workspace_manager = zlib_init_workspace_manager, - .cleanup_workspace_manager = zlib_cleanup_workspace_manager, - .get_workspace = zlib_get_workspace, - .put_workspace = zlib_put_workspace, - .alloc_workspace = zlib_alloc_workspace, - .free_workspace = zlib_free_workspace, - .compress_pages = zlib_compress_pages, - .decompress_bio = zlib_decompress_bio, - .decompress = zlib_decompress, - .set_level = zlib_set_level, + .workspace_manager = &wsm, + .max_level = 9, + .default_level = BTRFS_ZLIB_DEFAULT_LEVEL, }; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 6b9e29d050f3..9a4871636c6c 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -17,6 +17,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/zstd.h> +#include "misc.h" #include "compression.h" #include "ctree.h" @@ -90,6 +91,8 @@ static inline struct workspace *list_to_workspace(struct list_head *list) return container_of(list, struct workspace, list); } +void zstd_free_workspace(struct list_head *ws); +struct list_head *zstd_alloc_workspace(unsigned int level); /* * zstd_reclaim_timer_fn - reclaim timer * @t: timer @@ -102,10 +105,10 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; - spin_lock(&wsm.lock); + spin_lock_bh(&wsm.lock); if (list_empty(&wsm.lru_list)) { - spin_unlock(&wsm.lock); + spin_unlock_bh(&wsm.lock); return; } @@ -124,7 +127,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) level = victim->level; list_del(&victim->lru_list); list_del(&victim->list); - wsm.ops->free_workspace(&victim->list); + zstd_free_workspace(&victim->list); if (list_empty(&wsm.idle_ws[level - 1])) clear_bit(level - 1, &wsm.active_map); @@ -134,7 +137,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) if (!list_empty(&wsm.lru_list)) mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); - spin_unlock(&wsm.lock); + spin_unlock_bh(&wsm.lock); } /* @@ -164,7 +167,7 @@ static void zstd_calc_ws_mem_sizes(void) } } -static void zstd_init_workspace_manager(void) +void zstd_init_workspace_manager(void) { struct list_head *ws; int i; @@ -180,7 +183,7 @@ static void zstd_init_workspace_manager(void) for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&wsm.idle_ws[i]); - ws = wsm.ops->alloc_workspace(ZSTD_BTRFS_MAX_LEVEL); + ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL); if (IS_ERR(ws)) { pr_warn( "BTRFS: cannot preallocate zstd compression workspace\n"); @@ -190,22 +193,22 @@ static void zstd_init_workspace_manager(void) } } -static void zstd_cleanup_workspace_manager(void) +void zstd_cleanup_workspace_manager(void) { struct workspace *workspace; int i; - spin_lock(&wsm.lock); + spin_lock_bh(&wsm.lock); for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) { while (!list_empty(&wsm.idle_ws[i])) { workspace = container_of(wsm.idle_ws[i].next, struct workspace, list); list_del(&workspace->list); list_del(&workspace->lru_list); - wsm.ops->free_workspace(&workspace->list); + zstd_free_workspace(&workspace->list); } } - spin_unlock(&wsm.lock); + spin_unlock_bh(&wsm.lock); del_timer_sync(&wsm.timer); } @@ -227,7 +230,7 @@ static struct list_head *zstd_find_workspace(unsigned int level) struct workspace *workspace; int i = level - 1; - spin_lock(&wsm.lock); + spin_lock_bh(&wsm.lock); for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) { if (!list_empty(&wsm.idle_ws[i])) { ws = wsm.idle_ws[i].next; @@ -239,11 +242,11 @@ static struct list_head *zstd_find_workspace(unsigned int level) list_del(&workspace->lru_list); if (list_empty(&wsm.idle_ws[i])) clear_bit(i, &wsm.active_map); - spin_unlock(&wsm.lock); + spin_unlock_bh(&wsm.lock); return ws; } } - spin_unlock(&wsm.lock); + spin_unlock_bh(&wsm.lock); return NULL; } @@ -257,7 +260,7 @@ static struct list_head *zstd_find_workspace(unsigned int level) * attempt to allocate a new workspace. If we fail to allocate one due to * memory pressure, go to sleep waiting for the max level workspace to free up. */ -static struct list_head *zstd_get_workspace(unsigned int level) +struct list_head *zstd_get_workspace(unsigned int level) { struct list_head *ws; unsigned int nofs_flag; @@ -272,7 +275,7 @@ again: return ws; nofs_flag = memalloc_nofs_save(); - ws = wsm.ops->alloc_workspace(level); + ws = zstd_alloc_workspace(level); memalloc_nofs_restore(nofs_flag); if (IS_ERR(ws)) { @@ -298,11 +301,11 @@ again: * isn't set, it is also set here. Only the max level workspace tries and wakes * up waiting workspaces. */ -static void zstd_put_workspace(struct list_head *ws) +void zstd_put_workspace(struct list_head *ws) { struct workspace *workspace = list_to_workspace(ws); - spin_lock(&wsm.lock); + spin_lock_bh(&wsm.lock); /* A node is only taken off the lru if we are the corresponding level */ if (workspace->req_level == workspace->level) { @@ -322,13 +325,13 @@ static void zstd_put_workspace(struct list_head *ws) list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]); workspace->req_level = 0; - spin_unlock(&wsm.lock); + spin_unlock_bh(&wsm.lock); if (workspace->level == ZSTD_BTRFS_MAX_LEVEL) cond_wake_up(&wsm.wait); } -static void zstd_free_workspace(struct list_head *ws) +void zstd_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -337,7 +340,7 @@ static void zstd_free_workspace(struct list_head *ws) kfree(workspace); } -static struct list_head *zstd_alloc_workspace(unsigned int level) +struct list_head *zstd_alloc_workspace(unsigned int level) { struct workspace *workspace; @@ -363,13 +366,9 @@ fail: return ERR_PTR(-ENOMEM); } -static int zstd_compress_pages(struct list_head *ws, - struct address_space *mapping, - u64 start, - struct page **pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out) +int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); ZSTD_CStream *stream; @@ -544,7 +543,7 @@ out: return ret; } -static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) +int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); struct page **pages_in = cb->compressed_pages; @@ -622,10 +621,9 @@ done: return ret; } -static int zstd_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen) +int zstd_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); ZSTD_DStream *stream; @@ -707,23 +705,9 @@ finish: return ret; } -static unsigned int zstd_set_level(unsigned int level) -{ - if (!level) - return ZSTD_BTRFS_DEFAULT_LEVEL; - - return min_t(unsigned int, level, ZSTD_BTRFS_MAX_LEVEL); -} - const struct btrfs_compress_op btrfs_zstd_compress = { - .init_workspace_manager = zstd_init_workspace_manager, - .cleanup_workspace_manager = zstd_cleanup_workspace_manager, - .get_workspace = zstd_get_workspace, - .put_workspace = zstd_put_workspace, - .alloc_workspace = zstd_alloc_workspace, - .free_workspace = zstd_free_workspace, - .compress_pages = zstd_compress_pages, - .decompress_bio = zstd_decompress_bio, - .decompress = zstd_decompress, - .set_level = zstd_set_level, + /* ZSTD uses own workspace manager */ + .workspace_manager = NULL, + .max_level = ZSTD_BTRFS_MAX_LEVEL, + .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, }; |
