Merge tag 'for-6.5/splice-2023-06-23' of git://git.kernel.dk/linux

Pull splice updates from Jens Axboe:
 "This kills off ITER_PIPE to avoid a race between truncate,
  iov_iter_revert() on the pipe and an as-yet incomplete DMA to a bio
  with unpinned/unref'ed pages from an O_DIRECT splice read. This causes
  memory corruption.

  Instead, we either use (a) filemap_splice_read(), which invokes the
  buffered file reading code and splices from the pagecache into the
  pipe; (b) copy_splice_read(), which bulk-allocates a buffer, reads
  into it and then pushes the filled pages into the pipe; or (c) handle
  it in filesystem-specific code.

  Summary:

   - Rename direct_splice_read() to copy_splice_read()

   - Simplify the calculations for the number of pages to be reclaimed
     in copy_splice_read()

   - Turn do_splice_to() into a helper, vfs_splice_read(), so that it
     can be used by overlayfs and coda to perform the checks on the
     lower fs

   - Make vfs_splice_read() jump to copy_splice_read() to handle
     direct-I/O and DAX

   - Provide shmem with its own splice_read to handle non-existent pages
     in the pagecache. We don't want a ->read_folio() as we don't want
     to populate holes, but filemap_get_pages() requires it

   - Provide overlayfs with its own splice_read to call down to a lower
     layer as overlayfs doesn't provide ->read_folio()

   - Provide coda with its own splice_read to call down to a lower layer
     as coda doesn't provide ->read_folio()

   - Direct ->splice_read to copy_splice_read() in tty, procfs, kernfs
     and random files as they just copy to the output buffer and don't
     splice pages

   - Provide wrappers for afs, ceph, ecryptfs, ext4, f2fs, nfs, ntfs3,
     ocfs2, orangefs, xfs and zonefs to do locking and/or revalidation

   - Make cifs use filemap_splice_read()

   - Replace pointers to generic_file_splice_read() with pointers to
     filemap_splice_read() as DIO and DAX are handled in the caller;
     filesystems can still provide their own alternate ->splice_read()
     op

   - Remove generic_file_splice_read()

   - Remove ITER_PIPE and its paraphernalia as generic_file_splice_read
     was the only user"
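
The fallback described above (and the vfs_splice_read() items in the summary) amounts to a small dispatch step in the splice path. Below is a simplified sketch of that dispatch, not the actual helper: the real vfs_splice_read() also performs permission and length checks that are elided here, and the _sketch suffix is used to keep it clearly apart from the real function.

/* Simplified sketch only; error handling and length clamping are elided. */
static ssize_t vfs_splice_read_sketch(struct file *in, loff_t *ppos,
				      struct pipe_inode_info *pipe,
				      size_t len, unsigned int flags)
{
	if (unlikely(!in->f_op->splice_read))
		return -EINVAL;

	/*
	 * O_DIRECT and DAX don't go through the pagecache, so read into a
	 * bounce buffer and splice those pages instead (copy_splice_read()).
	 */
	if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host))
		return copy_splice_read(in, ppos, pipe, len, flags);

	/*
	 * Otherwise hand off to the filesystem's op, which after this
	 * series is usually filemap_splice_read() or a wrapper around it.
	 */
	return in->f_op->splice_read(in, ppos, pipe, len, flags);
}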

* tag 'for-6.5/splice-2023-06-23' of git://git.kernel.dk/linux: (31 commits)
  splice: kdoc for filemap_splice_read() and copy_splice_read()
  iov_iter: Kill ITER_PIPE
  splice: Remove generic_file_splice_read()
  splice: Use filemap_splice_read() instead of generic_file_splice_read()
  cifs: Use filemap_splice_read()
  trace: Convert trace/seq to use copy_splice_read()
  zonefs: Provide a splice-read wrapper
  xfs: Provide a splice-read wrapper
  orangefs: Provide a splice-read wrapper
  ocfs2: Provide a splice-read wrapper
  ntfs3: Provide a splice-read wrapper
  nfs: Provide a splice-read wrapper
  f2fs: Provide a splice-read wrapper
  ext4: Provide a splice-read wrapper
  ecryptfs: Provide a splice-read wrapper
  ceph: Provide a splice-read wrapper
  afs: Provide a splice-read wrapper
  9p: Add splice_read wrapper
  net: Make sock_splice_read() use copy_splice_read() by default
  tty, proc, kernfs, random: Use copy_splice_read()
  ...
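
Several of the per-filesystem patches in the list above follow the same small pattern: do whatever locking or revalidation that filesystem needs, then let filemap_splice_read() do the pagecache work. A hypothetical wrapper showing the shape of that pattern ("examplefs" and the choice of inode_lock_shared() are placeholders, not taken from any of the patches):

static ssize_t examplefs_splice_read(struct file *in, loff_t *ppos,
				     struct pipe_inode_info *pipe,
				     size_t len, unsigned int flags)
{
	struct inode *inode = file_inode(in);
	ssize_t ret;

	/* Placeholder for the fs-specific locking/revalidation step. */
	inode_lock_shared(inode);
	ret = filemap_splice_read(in, ppos, pipe, len, flags);
	inode_unlock_shared(inode);
	return ret;
}

Filesystems that need nothing extra simply point ->splice_read at filemap_splice_read() directly in their file_operations, which is what the "Use filemap_splice_read() instead of generic_file_splice_read()" patch does for the remaining users.
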
Linus Torvalds · 2023-06-26 11:52:12 -07:00 · 68 changed files with 694 additions and 621 deletions

mm/filemap.c

@@ -2693,8 +2693,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 		if (unlikely(iocb->ki_pos >= i_size_read(inode)))
 			break;
 
-		error = filemap_get_pages(iocb, iter->count, &fbatch,
-					  iov_iter_is_pipe(iter));
+		error = filemap_get_pages(iocb, iter->count, &fbatch, false);
 		if (error < 0)
 			break;
@@ -2878,9 +2877,24 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
 	return spliced;
 }
 
-/*
- * Splice folios from the pagecache of a buffered (ie. non-O_DIRECT) file into
- * a pipe.
+/**
+ * filemap_splice_read - Splice data from a file's pagecache into a pipe
+ * @in: The file to read from
+ * @ppos: Pointer to the file position to read from
+ * @pipe: The pipe to splice into
+ * @len: The amount to splice
+ * @flags: The SPLICE_F_* flags
+ *
+ * This function gets folios from a file's pagecache and splices them into the
+ * pipe. Readahead will be called as necessary to fill more folios. This may
+ * be used for blockdevs also.
+ *
+ * Return: On success, the number of bytes read will be returned and *@ppos
+ * will be updated if appropriate; 0 will be returned if there is no more data
+ * to be read; -EAGAIN will be returned if the pipe had no space, and some
+ * other negative error code will be returned on error. A short read may occur
+ * if the pipe has insufficient space, we reach the end of the data or we hit a
+ * hole.
  */
 ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
 			    struct pipe_inode_info *pipe,
@@ -2893,6 +2907,9 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
 	bool writably_mapped;
 	int i, error = 0;
 
+	if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes))
+		return 0;
+
 	init_sync_kiocb(&iocb, in);
 	iocb.ki_pos = *ppos;
@@ -2906,7 +2923,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
 	do {
 		cond_resched();
 
-		if (*ppos >= i_size_read(file_inode(in)))
+		if (*ppos >= i_size_read(in->f_mapping->host))
 			break;
 
 		iocb.ki_pos = *ppos;
@@ -2922,7 +2939,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
 		 * part of the page is not copied back to userspace (unless
 		 * another truncate extends the file - this is desired though).
 		 */
-		isize = i_size_read(file_inode(in));
+		isize = i_size_read(in->f_mapping->host);
 		if (unlikely(*ppos >= isize))
 			break;
 		end_offset = min_t(loff_t, isize, *ppos + len);
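
The kerneldoc added above describes the function that now sits behind an ordinary splice() from a buffered file. As a point of reference, a minimal userspace snippet (assumed test code, not part of this series) that exercises this path:

/* Assumed test snippet: splice from a buffered file into a pipe.  In the
 * kernel this goes through vfs_splice_read() and, for most filesystems
 * after this series, filemap_splice_read(). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int pfd[2];
	int fd = open("/etc/hostname", O_RDONLY);	/* any buffered file */

	if (fd < 0 || pipe(pfd) < 0)
		return 1;

	/* File -> pipe: the ->splice_read() side reworked by this series. */
	ssize_t n = splice(fd, NULL, pfd[1], NULL, sizeof(buf), 0);
	if (n > 0) {
		/* Drain the pipe with a plain read() so the data is visible. */
		n = read(pfd[0], buf, sizeof(buf));
		if (n > 0)
			n = write(STDOUT_FILENO, buf, (size_t)n);
	}
	return n < 0;
}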

mm/shmem.c

@@ -2731,6 +2731,138 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	return retval ? retval : error;
 }
 
+static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
+			      struct pipe_buffer *buf)
+{
+	return true;
+}
+
+static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
+				  struct pipe_buffer *buf)
+{
+}
+
+static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	return false;
+}
+
+static const struct pipe_buf_operations zero_pipe_buf_ops = {
+	.release	= zero_pipe_buf_release,
+	.try_steal	= zero_pipe_buf_try_steal,
+	.get		= zero_pipe_buf_get,
+};
+
+static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
+					loff_t fpos, size_t size)
+{
+	size_t offset = fpos & ~PAGE_MASK;
+
+	size = min_t(size_t, size, PAGE_SIZE - offset);
+
+	if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+		struct pipe_buffer *buf = pipe_head_buf(pipe);
+
+		*buf = (struct pipe_buffer) {
+			.ops	= &zero_pipe_buf_ops,
+			.page	= ZERO_PAGE(0),
+			.offset	= offset,
+			.len	= size,
+		};
+		pipe->head++;
+	}
+
+	return size;
+}
+
+static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
+				      struct pipe_inode_info *pipe,
+				      size_t len, unsigned int flags)
+{
+	struct inode *inode = file_inode(in);
+	struct address_space *mapping = inode->i_mapping;
+	struct folio *folio = NULL;
+	size_t total_spliced = 0, used, npages, n, part;
+	loff_t isize;
+	int error = 0;
+
+	/* Work out how much data we can actually add into the pipe */
+	used = pipe_occupancy(pipe->head, pipe->tail);
+	npages = max_t(ssize_t, pipe->max_usage - used, 0);
+	len = min_t(size_t, len, npages * PAGE_SIZE);
+
+	do {
+		if (*ppos >= i_size_read(inode))
+			break;
+
+		error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, SGP_READ);
+		if (error) {
+			if (error == -EINVAL)
+				error = 0;
+			break;
+		}
+		if (folio) {
+			folio_unlock(folio);
+
+			if (folio_test_hwpoison(folio)) {
+				error = -EIO;
+				break;
+			}
+		}
+
+		/*
+		 * i_size must be checked after we know the pages are Uptodate.
+		 *
+		 * Checking i_size after the check allows us to calculate
+		 * the correct value for "nr", which means the zero-filled
+		 * part of the page is not copied back to userspace (unless
+		 * another truncate extends the file - this is desired though).
+		 */
+		isize = i_size_read(inode);
+		if (unlikely(*ppos >= isize))
+			break;
+		part = min_t(loff_t, isize - *ppos, len);
+
+		if (folio) {
+			/*
+			 * If users can be writing to this page using arbitrary
+			 * virtual addresses, take care about potential aliasing
+			 * before reading the page on the kernel side.
+			 */
+			if (mapping_writably_mapped(mapping))
+				flush_dcache_folio(folio);
+			folio_mark_accessed(folio);
+			/*
+			 * Ok, we have the page, and it's up-to-date, so we can
+			 * now splice it into the pipe.
+			 */
+			n = splice_folio_into_pipe(pipe, folio, *ppos, part);
+			folio_put(folio);
+			folio = NULL;
+		} else {
+			n = splice_zeropage_into_pipe(pipe, *ppos, len);
+		}
+
+		if (!n)
+			break;
+		len -= n;
+		total_spliced += n;
+		*ppos += n;
+		in->f_ra.prev_pos = *ppos;
+		if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+			break;
+
+		cond_resched();
+	} while (len);
+
+	if (folio)
+		folio_put(folio);
+
+	file_accessed(in);
+	return total_spliced ? total_spliced : error;
+}
+
 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct address_space *mapping = file->f_mapping;
@@ -3971,7 +4103,7 @@ static const struct file_operations shmem_file_operations = {
 	.read_iter	= shmem_file_read_iter,
 	.write_iter	= generic_file_write_iter,
 	.fsync		= noop_fsync,
-	.splice_read	= generic_file_splice_read,
+	.splice_read	= shmem_file_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.fallocate	= shmem_fallocate,
 #endif
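
The hole handling in shmem_file_splice_read() above can be observed from userspace by splicing a sparse tmpfs file: the hole has no folio in the pagecache, so the read comes back as zeroes fed from ZERO_PAGE-backed pipe buffers rather than from freshly allocated, zero-filled folios. A small assumed demonstration (the /dev/shm path and the sizes are arbitrary):

/* Assumed demonstration, not part of the series: splice across a hole in
 * a tmpfs file.  shmem_get_folio(..., SGP_READ) returns no folio for the
 * hole, so splice_zeropage_into_pipe() supplies the zeroes. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int pfd[2];
	int fd = open("/dev/shm/sparse-demo", O_CREAT | O_RDWR | O_TRUNC, 0600);

	if (fd < 0 || pipe(pfd) < 0)
		return 1;
	if (ftruncate(fd, 1 << 20))	/* 1 MiB file, entirely a hole */
		return 1;

	/* Reads zeroes without instantiating folios for the hole. */
	ssize_t n = splice(fd, NULL, pfd[1], NULL, 65536, 0);
	return n <= 0;
}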