Merge tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull vfs dedup fixes from Dave Chinner:
 "This reworks the vfs data cloning infrastructure.

  We discovered many issues with these interfaces late in the 4.19 cycle
  - the worst of them (data corruption, setuid stripping) were fixed for
  XFS in 4.19-rc8, but a larger rework of the infrastructure fixing all
  the problems was needed. That rework is the contents of this pull
  request.

  Rework the vfs_clone_file_range and vfs_dedupe_file_range
  infrastructure to use a common .remap_file_range method and supply
  generic bounds and sanity checking functions that are shared with the
  data write path. The current VFS infrastructure has problems with
  rlimit, LFS file sizes, file time stamps, maximum filesystem file
  sizes, stripping setuid bits, etc and so they are addressed in these
  commits.

  We also introduce the ability for the ->remap_file_range methods to
  return short clones so that clones for vfs_copy_file_range() don't get
  rejected if the entire range can't be cloned. It also allows
  filesystems to sliently skip deduplication of partial EOF blocks if
  they are not capable of doing so without requiring errors to be thrown
  to userspace.

  Existing filesystems are converted to user the new remap_file_range
  method, and both XFS and ocfs2 are modified to make use of the new
  generic checking infrastructure"

* tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (28 commits)
  xfs: remove [cm]time update from reflink calls
  xfs: remove xfs_reflink_remap_range
  xfs: remove redundant remap partial EOF block checks
  xfs: support returning partial reflink results
  xfs: clean up xfs_reflink_remap_blocks call site
  xfs: fix pagecache truncation prior to reflink
  ocfs2: remove ocfs2_reflink_remap_range
  ocfs2: support partial clone range and dedupe range
  ocfs2: fix pagecache truncation prior to reflink
  ocfs2: truncate page cache for clone destination file before remapping
  vfs: clean up generic_remap_file_range_prep return value
  vfs: hide file range comparison function
  vfs: enable remap callers that can handle short operations
  vfs: plumb remap flags through the vfs dedupe functions
  vfs: plumb remap flags through the vfs clone functions
  vfs: make remap_file_range functions take and return bytes completed
  vfs: remap helper should update destination inode metadata
  vfs: pass remap flags to generic_remap_checks
  vfs: pass remap flags to generic_remap_file_range_prep
  vfs: combine the clone and dedupe into a single remap_file_range
  ...
This commit is contained in:
Linus Torvalds
2018-11-02 09:33:08 -07:00
20 changed files with 749 additions and 611 deletions

View File

@@ -1587,11 +1587,15 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
* Try cloning first, this is supported by more file systems, and
* more efficient if both clone and copy are supported (e.g. NFS).
*/
if (file_in->f_op->clone_file_range) {
ret = file_in->f_op->clone_file_range(file_in, pos_in,
file_out, pos_out, len);
if (ret == 0) {
ret = len;
if (file_in->f_op->remap_file_range) {
loff_t cloned;
cloned = file_in->f_op->remap_file_range(file_in, pos_in,
file_out, pos_out,
min_t(loff_t, MAX_RW_COUNT, len),
REMAP_FILE_CAN_SHORTEN);
if (cloned > 0) {
ret = cloned;
goto done;
}
}
@@ -1685,11 +1689,12 @@ out2:
return ret;
}
static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
bool write)
{
struct inode *inode = file_inode(file);
if (unlikely(pos < 0))
if (unlikely(pos < 0 || len < 0))
return -EINVAL;
if (unlikely((loff_t) (pos + len) < 0))
@@ -1707,178 +1712,44 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
}
/*
* Check that the two inodes are eligible for cloning, the ranges make
* sense, and then flush all dirty data. Caller must ensure that the
* inodes have been locked against any other modifications.
* Ensure that we don't remap a partial EOF block in the middle of something
* else. Assume that the offsets have already been checked for block
* alignment.
*
* Returns: 0 for "nothing to clone", 1 for "something to clone", or
* the usual negative error code.
* For deduplication we always scale down to the previous block because we
* can't meaningfully compare post-EOF contents.
*
* For clone we only link a partial EOF block above the destination file's EOF.
*
* Shorten the request if possible.
*/
int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
struct inode *inode_out, loff_t pos_out,
u64 *len, bool is_dedupe)
static int generic_remap_check_len(struct inode *inode_in,
struct inode *inode_out,
loff_t pos_out,
loff_t *len,
unsigned int remap_flags)
{
loff_t bs = inode_out->i_sb->s_blocksize;
loff_t blen;
loff_t isize;
bool same_inode = (inode_in == inode_out);
int ret;
u64 blkmask = i_blocksize(inode_in) - 1;
loff_t new_len = *len;
/* Don't touch certain kinds of inodes */
if (IS_IMMUTABLE(inode_out))
return -EPERM;
if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
return -ETXTBSY;
/* Don't reflink dirs, pipes, sockets... */
if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
/* Are we going all the way to the end? */
isize = i_size_read(inode_in);
if (isize == 0)
if ((*len & blkmask) == 0)
return 0;
/* Zero length dedupe exits immediately; reflink goes to EOF. */
if (*len == 0) {
if (is_dedupe || pos_in == isize)
return 0;
if (pos_in > isize)
return -EINVAL;
*len = isize - pos_in;
if ((remap_flags & REMAP_FILE_DEDUP) ||
pos_out + *len < i_size_read(inode_out))
new_len &= ~blkmask;
if (new_len == *len)
return 0;
if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
*len = new_len;
return 0;
}
/* Ensure offsets don't wrap and the input is inside i_size */
if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
pos_in + *len > isize)
return -EINVAL;
/* Don't allow dedupe past EOF in the dest file */
if (is_dedupe) {
loff_t disize;
disize = i_size_read(inode_out);
if (pos_out >= disize || pos_out + *len > disize)
return -EINVAL;
}
/* If we're linking to EOF, continue to the block boundary. */
if (pos_in + *len == isize)
blen = ALIGN(isize, bs) - pos_in;
else
blen = *len;
/* Only reflink if we're aligned to block boundaries */
if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
!IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
return -EINVAL;
/* Don't allow overlapped reflink within the same file */
if (same_inode) {
if (pos_out + blen > pos_in && pos_out < pos_in + blen)
return -EINVAL;
}
/* Wait for the completion of any pending IOs on both files */
inode_dio_wait(inode_in);
if (!same_inode)
inode_dio_wait(inode_out);
ret = filemap_write_and_wait_range(inode_in->i_mapping,
pos_in, pos_in + *len - 1);
if (ret)
return ret;
ret = filemap_write_and_wait_range(inode_out->i_mapping,
pos_out, pos_out + *len - 1);
if (ret)
return ret;
/*
* Check that the extents are the same.
*/
if (is_dedupe) {
bool is_same = false;
ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
inode_out, pos_out, *len, &is_same);
if (ret)
return ret;
if (!is_same)
return -EBADE;
}
return 1;
return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
}
EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
int do_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
int ret;
if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
/*
* FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
* the same mount. Practically, they only need to be on the same file
* system.
*/
if (inode_in->i_sb != inode_out->i_sb)
return -EXDEV;
if (!(file_in->f_mode & FMODE_READ) ||
!(file_out->f_mode & FMODE_WRITE) ||
(file_out->f_flags & O_APPEND))
return -EBADF;
if (!file_in->f_op->clone_file_range)
return -EOPNOTSUPP;
ret = clone_verify_area(file_in, pos_in, len, false);
if (ret)
return ret;
ret = clone_verify_area(file_out, pos_out, len, true);
if (ret)
return ret;
if (pos_in + len > i_size_read(inode_in))
return -EINVAL;
ret = file_in->f_op->clone_file_range(file_in, pos_in,
file_out, pos_out, len);
if (!ret) {
fsnotify_access(file_in);
fsnotify_modify(file_out);
}
return ret;
}
EXPORT_SYMBOL(do_clone_file_range);
int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len)
{
int ret;
file_start_write(file_out);
ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len);
file_end_write(file_out);
return ret;
}
EXPORT_SYMBOL(vfs_clone_file_range);
/*
* Read a page's worth of file data into the page cache. Return the page
@@ -1886,13 +1757,9 @@ EXPORT_SYMBOL(vfs_clone_file_range);
*/
static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
{
struct address_space *mapping;
struct page *page;
pgoff_t n;
n = offset >> PAGE_SHIFT;
mapping = inode->i_mapping;
page = read_mapping_page(mapping, n, NULL);
page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
if (IS_ERR(page))
return page;
if (!PageUptodate(page)) {
@@ -1907,9 +1774,9 @@ static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
* Compare extents of two files to see if they are the same.
* Caller must have locked both inodes to prevent write races.
*/
int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
struct inode *dest, loff_t destoff,
loff_t len, bool *is_same)
static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
struct inode *dest, loff_t destoff,
loff_t len, bool *is_same)
{
loff_t src_poff;
loff_t dest_poff;
@@ -1974,7 +1841,177 @@ int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
out_error:
return error;
}
EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
/*
* Check that the two inodes are eligible for cloning, the ranges make
* sense, and then flush all dirty data. Caller must ensure that the
* inodes have been locked against any other modifications.
*
* If there's an error, then the usual negative error code is returned.
* Otherwise returns 0 with *len set to the request length.
*/
int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *len, unsigned int remap_flags)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
bool same_inode = (inode_in == inode_out);
int ret;
/* Don't touch certain kinds of inodes */
if (IS_IMMUTABLE(inode_out))
return -EPERM;
if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
return -ETXTBSY;
/* Don't reflink dirs, pipes, sockets... */
if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
/* Zero length dedupe exits immediately; reflink goes to EOF. */
if (*len == 0) {
loff_t isize = i_size_read(inode_in);
if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
return 0;
if (pos_in > isize)
return -EINVAL;
*len = isize - pos_in;
if (*len == 0)
return 0;
}
/* Check that we don't violate system file offset limits. */
ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
remap_flags);
if (ret)
return ret;
/* Wait for the completion of any pending IOs on both files */
inode_dio_wait(inode_in);
if (!same_inode)
inode_dio_wait(inode_out);
ret = filemap_write_and_wait_range(inode_in->i_mapping,
pos_in, pos_in + *len - 1);
if (ret)
return ret;
ret = filemap_write_and_wait_range(inode_out->i_mapping,
pos_out, pos_out + *len - 1);
if (ret)
return ret;
/*
* Check that the extents are the same.
*/
if (remap_flags & REMAP_FILE_DEDUP) {
bool is_same = false;
ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
inode_out, pos_out, *len, &is_same);
if (ret)
return ret;
if (!is_same)
return -EBADE;
}
ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
remap_flags);
if (ret)
return ret;
/* If can't alter the file contents, we're done. */
if (!(remap_flags & REMAP_FILE_DEDUP)) {
/* Update the timestamps, since we can alter file contents. */
if (!(file_out->f_mode & FMODE_NOCMTIME)) {
ret = file_update_time(file_out);
if (ret)
return ret;
}
/*
* Clear the security bits if the process is not being run by
* root. This keeps people from modifying setuid and setgid
* binaries.
*/
ret = file_remove_privs(file_out);
if (ret)
return ret;
}
return 0;
}
EXPORT_SYMBOL(generic_remap_file_range_prep);
loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
loff_t ret;
WARN_ON_ONCE(remap_flags);
if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
/*
* FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
* the same mount. Practically, they only need to be on the same file
* system.
*/
if (inode_in->i_sb != inode_out->i_sb)
return -EXDEV;
if (!(file_in->f_mode & FMODE_READ) ||
!(file_out->f_mode & FMODE_WRITE) ||
(file_out->f_flags & O_APPEND))
return -EBADF;
if (!file_in->f_op->remap_file_range)
return -EOPNOTSUPP;
ret = remap_verify_area(file_in, pos_in, len, false);
if (ret)
return ret;
ret = remap_verify_area(file_out, pos_out, len, true);
if (ret)
return ret;
ret = file_in->f_op->remap_file_range(file_in, pos_in,
file_out, pos_out, len, remap_flags);
if (ret < 0)
return ret;
fsnotify_access(file_in);
fsnotify_modify(file_out);
return ret;
}
EXPORT_SYMBOL(do_clone_file_range);
loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags)
{
loff_t ret;
file_start_write(file_out);
ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
remap_flags);
file_end_write(file_out);
return ret;
}
EXPORT_SYMBOL(vfs_clone_file_range);
/* Check whether we are allowed to dedupe the destination file */
static bool allow_file_dedupe(struct file *file)
@@ -1990,16 +2027,20 @@ static bool allow_file_dedupe(struct file *file)
return false;
}
int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
struct file *dst_file, loff_t dst_pos, u64 len)
loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
struct file *dst_file, loff_t dst_pos,
loff_t len, unsigned int remap_flags)
{
s64 ret;
loff_t ret;
WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
REMAP_FILE_CAN_SHORTEN));
ret = mnt_want_write_file(dst_file);
if (ret)
return ret;
ret = clone_verify_area(dst_file, dst_pos, len, true);
ret = remap_verify_area(dst_file, dst_pos, len, true);
if (ret < 0)
goto out_drop_write;
@@ -2016,11 +2057,16 @@ int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
goto out_drop_write;
ret = -EINVAL;
if (!dst_file->f_op->dedupe_file_range)
if (!dst_file->f_op->remap_file_range)
goto out_drop_write;
ret = dst_file->f_op->dedupe_file_range(src_file, src_pos,
dst_file, dst_pos, len);
if (len == 0) {
ret = 0;
goto out_drop_write;
}
ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
out_drop_write:
mnt_drop_write_file(dst_file);
@@ -2037,7 +2083,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
int i;
int ret;
u16 count = same->dest_count;
int deduped;
loff_t deduped;
if (!(file->f_mode & FMODE_READ))
return -EINVAL;
@@ -2056,7 +2102,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
if (!S_ISREG(src->i_mode))
goto out;
ret = clone_verify_area(file, off, len, false);
ret = remap_verify_area(file, off, len, false);
if (ret < 0)
goto out;
ret = 0;
@@ -2088,7 +2134,8 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
}
deduped = vfs_dedupe_file_range_one(file, off, dst_file,
info->dest_offset, len);
info->dest_offset, len,
REMAP_FILE_CAN_SHORTEN);
if (deduped == -EBADE)
info->status = FILE_DEDUPE_RANGE_DIFFERS;
else if (deduped < 0)