aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Darrick J. Wong <djwong@kernel.org> 2021-01-05 17:45:09 -0800
committer Darrick J. Wong <djwong@kernel.org> 2021-01-15 17:40:44 -0800
commit 4d8f99e0a4197987823f6df1681efd02e87c1b2a (patch)
tree 0be60bd1f66b98283f0861aa9192c09b770f1b5e
parent 182ed4eb067b7b081cb731498f19f51cd1476efb (diff)
download xfs-linux-4d8f99e0a4197987823f6df1681efd02e87c1b2a.tar.gz
xfs: add a ->xchg_file_range handler
Notice: this object is not reachable from any branch.
Add a function to handle file range exchange requests from the VFS.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Notice: this object is not reachable from any branch.
-rw-r--r-- fs/xfs/xfs_file.c 49
-rw-r--r-- fs/xfs/xfs_inode.c 13
-rw-r--r-- fs/xfs/xfs_inode.h 1
-rw-r--r-- fs/xfs/xfs_trace.h 4
-rw-r--r-- fs/xfs/xfs_xchgrange.c 331
-rw-r--r-- fs/xfs/xfs_xchgrange.h 11
6 files changed, 409 insertions, 0 deletions
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 6fed82a60a75a..9a027755b7934 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -24,6 +24,7 @@
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
+#include "xfs_xchgrange.h"
#include <linux/falloc.h>
#include <linux/backing-dev.h>
@@ -1222,6 +1223,53 @@ use_generic:
}
+/*
+ * Handle a file range exchange request from the VFS.
+ *
+ * Returns -EIO if the filesystem has been forcibly shut down; otherwise
+ * returns 0 or the negative errno reported by the locking, preparation, or
+ * exchange step.  Any failure after the locks are taken is also reported
+ * through the xfs_file_xchg_range_error tracepoint.
+ */
STATIC int
+xfs_file_xchg_range(
+	struct file		*file1,
+	struct file		*file2,
+	struct file_xchg_range	*fxr)
+{
+	struct inode		*inode1 = file_inode(file1);
+	struct inode		*inode2 = file_inode(file2);
+	struct xfs_inode	*ip1 = XFS_I(inode1);
+	struct xfs_inode	*ip2 = XFS_I(inode2);
+	unsigned int		cmtime_flags = 0;
+	int			error;
+
+	if (XFS_FORCED_SHUTDOWN(ip1->i_mount))
+		return -EIO;
+
+	/* Timestamps are updated unless the fd or the inode forbids it. */
+	if (likely(!((file1->f_mode & FMODE_NOCMTIME) || IS_NOCMTIME(inode1))))
+		cmtime_flags |= XFS_XCHG_RANGE_UPD_CMTIME1;
+	if (likely(!((file2->f_mode & FMODE_NOCMTIME) || IS_NOCMTIME(inode2))))
+		cmtime_flags |= XFS_XCHG_RANGE_UPD_CMTIME2;
+
+	/* Lock out IO on both files before touching their contents. */
+	error = xfs_ilock2_io_mmap(ip1, ip2);
+	if (error)
+		return error;
+
+	/* Validate and prepare the request; only then do the exchange. */
+	error = xfs_xchg_range_prep(file1, file2, fxr);
+	if (!error) {
+		trace_xfs_file_xchg_range(ip1, fxr->file1_offset, fxr->length,
+				ip2, fxr->file2_offset);
+
+		error = xfs_xchg_range(ip1, ip2, fxr, cmtime_flags);
+	}
+
+	xfs_iunlock2_io_mmap(ip1, ip2);
+	if (error)
+		trace_xfs_file_xchg_range_error(ip2, error, _RET_IP_);
+	return error;
+}
+
+STATIC int
xfs_file_open(
struct inode *inode,
struct file *file)
@@ -1485,6 +1533,7 @@ const struct file_operations xfs_file_operations = {
.fadvise = xfs_file_fadvise,
.copy_file_range = xfs_file_copy_range,
.remap_file_range = xfs_file_remap_range,
+ .xchg_file_range = xfs_file_xchg_range,
};
const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 284219f56ca2c..2a08131ac67d8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3813,3 +3813,16 @@ xfs_iunlock2_io_mmap(
if (!same_inode)
inode_unlock(VFS_I(ip1));
}
+
+/*
+ * Return the size, in bytes, of the fundamental allocation unit for a file.
+ * Realtime files use the superblock's realtime extent size (sb_rextsize
+ * filesystem blocks); all other files use a single filesystem block.
+ */
+unsigned int
+xfs_inode_alloc_unitsize(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	unsigned int		nr_fsblocks;
+
+	nr_fsblocks = XFS_IS_REALTIME_INODE(ip) ? mp->m_sb.sb_rextsize : 1;
+	return XFS_FSB_TO_B(mp, nr_fsblocks);
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 95fa03e998762..1da7a16e33feb 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -489,5 +489,6 @@ void xfs_end_io(struct work_struct *work);
int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+unsigned int xfs_inode_alloc_unitsize(struct xfs_inode *ip);
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f6f1ba90ebb37..84fb33376201c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3323,6 +3323,10 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow);
DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap);
DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece);
DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error);
+
+/* swapext tracepoints */
+
+/*
+ * xfs_file_xchg_range is emitted by the file range exchange handler once the
+ * request has been prepared; it records both inodes, their byte offsets, and
+ * the request length.  xfs_file_xchg_range_error is emitted when that
+ * handler fails after taking its locks; it records the second inode, the
+ * error code, and the return address (_RET_IP_).
+ */
+DEFINE_DOUBLE_IO_EVENT(xfs_file_xchg_range);
+DEFINE_INODE_ERROR_EVENT(xfs_file_xchg_range_error);
DEFINE_INODE_IREC_EVENT(xfs_swapext_extent1);
DEFINE_INODE_IREC_EVENT(xfs_swapext_extent2);
DEFINE_ITRUNC_EVENT(xfs_swapext_update_inode_size);
diff --git a/fs/xfs/xfs_xchgrange.c b/fs/xfs/xfs_xchgrange.c
index 5e7098d5838ed..e53244cc3c071 100644
--- a/fs/xfs/xfs_xchgrange.c
+++ b/fs/xfs/xfs_xchgrange.c
@@ -13,8 +13,13 @@
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
+#include "xfs_quota.h"
+#include "xfs_bmap_util.h"
+#include "xfs_reflink.h"
+#include "xfs_trace.h"
#include "xfs_swapext.h"
#include "xfs_xchgrange.h"
+#include "xfs_sb.h"
/* Lock (and optionally join) two inodes for a file range exchange. */
void
@@ -64,3 +69,329 @@ xfs_xchg_range_estimate(
xfs_xchg_range_iunlock(req->ip1, req->ip2);
return error;
}
+
+/*
+ * Prepare two files to have their data exchanged.
+ *
+ * Checks that the files are compatible, runs the generic VFS preparation,
+ * attaches dquots to both inodes, and flushes the affected range of each
+ * file (file2 first, then file1).
+ *
+ * Returns 0 on success, -EINVAL if exactly one of the files is realtime,
+ * -EOPNOTSUPP if an alignment check is required but the allocation unit is
+ * not a power of two, or the negative errno of whichever step failed.
+ */
+int
+xfs_xchg_range_prep(
+	struct file		*file1,
+	struct file		*file2,
+	struct file_xchg_range	*fxr)
+{
+	struct xfs_inode	*ip1 = XFS_I(file_inode(file1));
+	struct xfs_inode	*ip2 = XFS_I(file_inode(file2));
+	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip2);
+	int			error;
+
+	/* Both files must be realtime, or neither of them. */
+	if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
+		return -EINVAL;
+
+	/*
+	 * The VFS helpers' alignment checks only work for power-of-2
+	 * allocation units, which the realtime volume does not guarantee when
+	 * an extent size is set.  FULL_FILES requests skip the alignment
+	 * checks, so they are exempt from this restriction.
+	 */
+	if (!(fxr->flags & FILE_XCHG_RANGE_FULL_FILES) &&
+	    !is_power_of_2(alloc_unit))
+		return -EOPNOTSUPP;
+
+	error = generic_xchg_file_range_prep(file1, file2, fxr, alloc_unit);
+	if (error)
+		return error;
+
+	/* Dquots must be attached before any block mappings change. */
+	error = xfs_qm_dqattach(ip2);
+	if (!error)
+		error = xfs_qm_dqattach(ip1);
+	if (error)
+		return error;
+
+	/* Flush the range of each file that takes part in the exchange. */
+	error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
+	if (!error)
+		error = xfs_flush_unmap_range(ip1, fxr->file1_offset,
+				fxr->length);
+	return error;
+}
+
+/*
+ * Make a particular type of quota reservation.
+ *
+ * @tpp points at the caller's transaction pointer, @req names the two inodes,
+ * @ip1_mapped and @ip2_mapped are the block counts for each file, @qmopts
+ * selects the kind of blocks being reserved, and @quota_retry is passed
+ * through to xfs_trans_reserve_quota_nblks() for the net-gain reservations.
+ *
+ * Returns 0 or a negative errno.  A return with *quota_retry set means the
+ * caller must start over; this function returns immediately in that case.
+ */
+STATIC int
+xfs_xchg_range_reserve_quota_blocks(
+ struct xfs_trans **tpp,
+ const struct xfs_swapext_req *req,
+ xfs_filblks_t ip1_mapped,
+ xfs_filblks_t ip2_mapped,
+ unsigned int qmopts,
+ bool *quota_retry)
+{
+ int error;
+
+ /*
+ * For each file, compute the net gain in the number of blocks that
+ * will be mapped into that file and reserve that much quota. The
+ * quota counts must be able to absorb at least that much space.
+ */
+ if (ip2_mapped > ip1_mapped) {
+ error = xfs_trans_reserve_quota_nblks(tpp, req->ip1,
+ ip2_mapped - ip1_mapped, 0,
+ qmopts, quota_retry);
+ /*
+  * NOTE(review): a NULL *tpp after the call presumably means the
+  * helper gave up the transaction (and ip1's ILOCK) so that the
+  * caller can retry; drop the other inode's ILOCK to match.
+  * Confirm against xfs_trans_reserve_quota_nblks().
+  */
+ if (!(*tpp) && req->ip1 != req->ip2)
+ xfs_iunlock(req->ip2, XFS_ILOCK_EXCL);
+ if (error || *quota_retry)
+ return error;
+ }
+
+ /* Mirror image of the case above: ip2 is the file gaining blocks. */
+ if (ip1_mapped > ip2_mapped) {
+ error = xfs_trans_reserve_quota_nblks(tpp, req->ip2,
+ ip1_mapped - ip2_mapped, 0,
+ qmopts, quota_retry);
+ if (!(*tpp) && req->ip1 != req->ip2)
+ xfs_iunlock(req->ip1, XFS_ILOCK_EXCL);
+ if (error || *quota_retry)
+ return error;
+ }
+
+ /*
+ * For each file, forcibly reserve the gross gain in mapped blocks so
+ * that we don't trip over any quota block reservation assertions.
+ * We must reserve the gross gain because the quota code subtracts from
+ * bcount the number of blocks that we unmap; it does not add that
+ * quantity back to the quota block reservation.
+ */
+ /* No retry pointer is passed for the forced reservations. */
+ error = xfs_trans_reserve_quota_nblks(tpp, req->ip1, ip1_mapped, 0,
+ XFS_QMOPT_FORCE_RES | qmopts, NULL);
+ if (error)
+ return error;
+
+ return xfs_trans_reserve_quota_nblks(tpp, req->ip2, ip2_mapped, 0,
+ XFS_QMOPT_FORCE_RES | qmopts, NULL);
+}
+
+/*
+ * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
+ * this if quota enforcement is disabled or if both inodes' dquots are the
+ * same.
+ *
+ * Regular blocks are reserved first (from res->ip{1,2}_bcount), then realtime
+ * blocks (from res->ip{1,2}_rtbcount).
+ *
+ * Returns 0 or a negative errno.  If this returns 0 with *tpp set to NULL,
+ * the transaction was given up so that the caller can retry the operation;
+ * nothing further has been reserved in that case.
+ */
+STATIC int
+xfs_xchg_range_reserve_quota(
+	struct xfs_trans		**tpp,
+	const struct xfs_swapext_req	*req,
+	const struct xfs_swapext_res	*res,
+	bool				*quota_retry)
+{
+	int				error;
+
+	/*
+	 * Don't bother with a quota reservation if we're not enforcing them
+	 * or the two inodes have the same dquots.
+	 */
+	if (!XFS_IS_QUOTA_ON((*tpp)->t_mountp) ||
+	    (req->ip1->i_udquot == req->ip2->i_udquot &&
+	     req->ip1->i_gdquot == req->ip2->i_gdquot &&
+	     req->ip1->i_pdquot == req->ip2->i_pdquot))
+		return 0;
+
+	error = xfs_xchg_range_reserve_quota_blocks(tpp, req, res->ip1_bcount,
+			res->ip2_bcount, XFS_QMOPT_RES_REGBLKS, quota_retry);
+
+	/*
+	 * Stop here if the first pass failed or gave up the transaction for a
+	 * retry.  In the latter case *tpp is NULL and the ILOCKs have been
+	 * dropped, so the realtime pass must not run against it.  Test *tpp
+	 * rather than *quota_retry: the retry flag can still hold a value
+	 * left over from an earlier attempt if the pass above never made a
+	 * retryable reservation.
+	 */
+	if (error || !(*tpp))
+		return error;
+
+	return xfs_xchg_range_reserve_quota_blocks(tpp, req, res->ip1_rtbcount,
+			res->ip2_rtbcount, XFS_QMOPT_RES_RTBLKS, quota_retry);
+}
+
+/*
+ * Enable the atomic file extent swap feature in the primary superblock.
+ *
+ * Returns 0 if the feature is already enabled, -EOPNOTSUPP if this
+ * filesystem cannot support it, or the result of adding the
+ * XFS_SB_FEAT_INCOMPAT_LOG_ATOMIC_SWAP log incompat feature.
+ */
+STATIC int
+xfs_add_atomic_swap(
+	struct xfs_mount	*mp)
+{
+	struct xfs_sb		*sbp = &mp->m_sb;
+
+	/* Nothing to do if the feature bit is already set. */
+	if (xfs_sb_version_hasatomicswap(sbp))
+		return 0;
+
+	/*
+	 * Atomic extent swapping needs a filesystem new enough to have
+	 * reflink or rmap support enabled...
+	 */
+	if (!xfs_sb_version_canatomicswap(sbp))
+		return -EOPNOTSUPP;
+
+	/* ...and one that is not configured with realtime support. */
+	if (xfs_sb_version_hasrealtime(sbp))
+		return -EOPNOTSUPP;
+
+	xfs_warn(mp,
+ "EXPERIMENTAL atomic file range swap feature added. Use at your own risk!");
+	return xfs_add_incompat_log_feature(mp,
+			XFS_SB_FEAT_INCOMPAT_LOG_ATOMIC_SWAP);
+}
+
+/*
+ * Exchange the contents of two files.
+ *
+ * @ip1 and @ip2 are the files taking part, @fxr describes the byte ranges and
+ * the FILE_XCHG_RANGE_* flags of the request, and @private_flags carries the
+ * XFS_XCHG_RANGE_UPD_CMTIME{1,2} bits selecting which files get their mtime
+ * and ctime updated.  The comments below assume that the prep function has
+ * already validated @fxr and flushed the affected ranges.
+ *
+ * Returns 0 on success (including a successful dry run) or a negative errno.
+ * On every exit after the transaction has been allocated, the transaction is
+ * either committed or cancelled and both ILOCKs are released.
+ */
+int
+xfs_xchg_range(
+	struct xfs_inode		*ip1,
+	struct xfs_inode		*ip2,
+	const struct file_xchg_range	*fxr,
+	unsigned int			private_flags)
+{
+	struct xfs_swapext_req		req = {
+		.ip1			= ip1,
+		.ip2			= ip2,
+		.whichfork		= XFS_DATA_FORK,
+	};
+	struct xfs_swapext_res		res;
+	struct xfs_mount		*mp = ip1->i_mount;
+	struct xfs_trans		*tp;
+	loff_t				req_len;
+	unsigned int			attempts = 4; /* XXX ugly hack */
+	bool				quota_retry = false;
+	int				error;
+
+	/* We don't support whole-fork swapping yet. */
+	if (!xfs_sb_version_canatomicswap(&mp->m_sb))
+		return -EOPNOTSUPP;
+
+	/* Translate the VFS request flags into swapext request flags. */
+	if (fxr->flags & FILE_XCHG_RANGE_TO_EOF)
+		req.flags |= XFS_SWAPEXT_SET_SIZES;
+	if (fxr->flags & FILE_XCHG_RANGE_SKIP_FILE1_HOLES)
+		req.flags |= XFS_SWAPEXT_SKIP_FILE1_HOLES;
+
+	req.startoff1 = XFS_B_TO_FSBT(mp, fxr->file1_offset);
+	req.startoff2 = XFS_B_TO_FSBT(mp, fxr->file2_offset);
+
+	/*
+	 * Round the request length up to the nearest fundamental unit of
+	 * allocation.  The prep function already checked that the request
+	 * offsets and length in @fxr are safe to round up.
+	 *
+	 * NOTE(review): round_up() expects a power-of-2 alignment, but the
+	 * prep function skips its power-of-2 check for FULL_FILES requests.
+	 * Confirm this is safe for realtime files with an odd extent size.
+	 */
+	req_len = round_up(fxr->length, xfs_inode_alloc_unitsize(ip2));
+	req.blockcount = XFS_B_TO_FSB(mp, req_len);
+
+	/*
+	 * Cancel CoW fork preallocations for the ranges of both files.  The
+	 * prep function should have flushed all the dirty data, so the only
+	 * extents remaining should be speculative.
+	 */
+	if (xfs_inode_has_cow_data(ip1)) {
+		error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
+				fxr->length, true);
+		if (error)
+			return error;
+	}
+
+	if (xfs_inode_has_cow_data(ip2)) {
+		error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
+				fxr->length, true);
+		if (error)
+			return error;
+	}
+
+	/* Work out how much space and quota the exchange will need. */
+	error = xfs_xchg_range_estimate(&req, &res);
+	if (error)
+		return error;
+
+	/*
+	 * If the caller wanted atomic swap, make sure the feature bit is
+	 * turned on and ready to go.
+	 */
+	if (!(fxr->flags & FILE_XCHG_RANGE_NONATOMIC)) {
+		error = xfs_add_atomic_swap(mp);
+		if (error)
+			return error;
+	}
+
+retry:
+	/* Allocate the transaction, lock the inodes, and join them. */
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, res.resblks, 0,
+			XFS_TRANS_RES_FDBLKS, &tp);
+	if (error)
+		return error;
+
+	xfs_xchg_range_ilock(tp, ip1, ip2);
+
+	trace_xfs_swap_extent_before(ip2, 0);
+	trace_xfs_swap_extent_before(ip1, 1);
+
+	/*
+	 * Do all of the inputs checking that we can only do once we've taken
+	 * both ILOCKs.
+	 */
+	error = generic_xchg_file_range_check_fresh(VFS_I(ip1), VFS_I(ip2),
+			fxr);
+	if (error)
+		goto out_trans_cancel;
+
+	error = xfs_swapext_check_extents(mp, &req);
+	if (error)
+		goto out_trans_cancel;
+
+	/*
+	 * Reserve ourselves some quota if any of them are in enforcing mode.
+	 * In theory we only need enough to satisfy the change in the number
+	 * of blocks between the two ranges being remapped.
+	 */
+	error = xfs_xchg_range_reserve_quota(&tp, &req, &res, &quota_retry);
+	if (error) {
+		/*
+		 * A failed reservation normally leaves the transaction and
+		 * both ILOCKs with us, so they must be released on the way
+		 * out instead of being leaked.  If the reservation code
+		 * already gave up the transaction (tp is NULL), it dropped
+		 * the ILOCKs as well and there is nothing left to undo.
+		 */
+		if (tp)
+			goto out_trans_cancel;
+		return error;
+	}
+	if (quota_retry) {
+		/* The transaction and the ILOCKs are gone; start over. */
+		if (--attempts == 0) {
+			xfs_emerg(mp, "OY! QUOTA RES FAILED 4x IN XCHGRANGE!");
+			return -EL3HLT;
+		}
+		goto retry;
+	}
+
+	/* If we got this far on a dry run, all parameters are ok. */
+	if (fxr->flags & FILE_XCHG_RANGE_DRY_RUN)
+		goto out_trans_cancel;
+
+	/* Update the mtime and ctime of both files. */
+	if (private_flags & XFS_XCHG_RANGE_UPD_CMTIME1)
+		xfs_trans_ichgtime(tp, ip1,
+				XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	if (private_flags & XFS_XCHG_RANGE_UPD_CMTIME2)
+		xfs_trans_ichgtime(tp, ip2,
+				XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	/* Exchange the file contents by swapping the block mappings. */
+	error = xfs_swapext(&tp, &req);
+	if (error)
+		goto out_trans_cancel;
+
+	/*
+	 * If the caller wanted us to exchange the contents of two complete
+	 * files of unequal length, exchange the incore sizes now.  This should
+	 * be safe because we flushed both files' page caches and moved all the
+	 * post-eof extents, so there should not be anything to zero.
+	 */
+	if (fxr->flags & FILE_XCHG_RANGE_TO_EOF) {
+		loff_t	temp;
+
+		temp = i_size_read(VFS_I(ip2));
+		i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
+		i_size_write(VFS_I(ip1), temp);
+	}
+
+	/* Relog the inodes to keep transactions moving forward. */
+	xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
+	xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
+
+	/*
+	 * Force the log to persist metadata updates if the caller or the
+	 * administrator requires this.  The VFS prep function already flushed
+	 * the relevant parts of the page cache.
+	 */
+	if ((mp->m_flags & XFS_MOUNT_WSYNC) ||
+	    (fxr->flags & FILE_XCHG_RANGE_FSYNC))
+		xfs_trans_set_sync(tp);
+
+	error = xfs_trans_commit(tp);
+
+	trace_xfs_swap_extent_after(ip2, 0);
+	trace_xfs_swap_extent_after(ip1, 1);
+
+out_unlock:
+	xfs_xchg_range_iunlock(ip1, ip2);
+	return error;
+
+out_trans_cancel:
+	xfs_trans_cancel(tp);
+	goto out_unlock;
+}
diff --git a/fs/xfs/xfs_xchgrange.h b/fs/xfs/xfs_xchgrange.h
index ddda2bfb6f4b9..cca2970346890 100644
--- a/fs/xfs/xfs_xchgrange.h
+++ b/fs/xfs/xfs_xchgrange.h
@@ -15,5 +15,16 @@ void xfs_xchg_range_iunlock(struct xfs_inode *ip1, struct xfs_inode *ip2);
int xfs_xchg_range_estimate(const struct xfs_swapext_req *req,
struct xfs_swapext_res *res);
+/*
+ * Validate and prepare two files for a range exchange.  Returns 0 or a
+ * negative errno.
+ */
+int xfs_xchg_range_prep(struct file *file1, struct file *file2,
+ struct file_xchg_range *fxr);
+
+/* Bits for the private_flags argument of xfs_xchg_range(). */
+
+/* Update ip1's change and mod time. */
+#define XFS_XCHG_RANGE_UPD_CMTIME1 (1 << 0)
+
+/* Update ip2's change and mod time. */
+#define XFS_XCHG_RANGE_UPD_CMTIME2 (1 << 1)
+
+/*
+ * Exchange the file ranges described by fxr between ip1 and ip2.
+ * private_flags is a mask of the XFS_XCHG_RANGE_UPD_CMTIME* bits above.
+ * Returns 0 or a negative errno.
+ */
+int xfs_xchg_range(struct xfs_inode *ip1, struct xfs_inode *ip2,
+ const struct file_xchg_range *fxr, unsigned int private_flags);
#endif /* __XFS_XCHGRANGE_H__ */