参照元†
返り値†
/*
* This is a library function for use by filesystem drivers.
*
* The locking rules are governed by the flags parameter:
* - if the flags value contains DIO_LOCKING we use a fancy locking
* scheme for dumb filesystems.
* For writes this function is called under i_mutex and returns with
* i_mutex held, for reads, i_mutex is not held on entry, but it is
* taken and dropped again before returning.
* For reads and writes i_alloc_sem is taken in shared mode and released
* on I/O completion (which may happen asynchronously after returning to
* the caller).
*
* - if the flags value does NOT contain DIO_LOCKING we don't use any
* internal locking but rather rely on the filesystem to synchronize
* direct I/O reads/writes versus each other and truncate.
* For reads and writes both i_mutex and i_alloc_sem are not held on
* entry and are never taken.
*/
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
int flags)
{
int seg;
size_t size;
unsigned long addr;
unsigned blkbits = inode->i_blkbits;
unsigned bdev_blkbits = 0;
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
loff_t end = offset;
struct dio *dio;
if (rw & WRITE)
rw = WRITE_ODIRECT_PLUG;
if (bdev)
bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
if (offset & blocksize_mask) {
if (bdev)
blkbits = bdev_blkbits;
blocksize_mask = (1 << blkbits) - 1;
if (offset & blocksize_mask)
goto out;
}
/* Check the memory alignment. Blocks cannot straddle pages */
for (seg = 0; seg < nr_segs; seg++) {
addr = (unsigned long)iov[seg].iov_base;
size = iov[seg].iov_len;
end += size;
if ((addr & blocksize_mask) || (size & blocksize_mask)) {
if (bdev)
blkbits = bdev_blkbits;
blocksize_mask = (1 << blkbits) - 1;
if ((addr & blocksize_mask) || (size & blocksize_mask))
goto out;
}
}
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
retval = -ENOMEM;
if (!dio)
goto out;
/*
* Believe it or not, zeroing out the page array caused a .5%
* performance regression in a database benchmark. So, we take
* care to only zero out what's needed.
*/
memset(dio, 0, offsetof(struct dio, pages));
dio->flags = flags;
if (dio->flags & DIO_LOCKING) {
/* watch out for a 0 len io from a tricksy fs */
if (rw == READ && end > offset) {
struct address_space *mapping =
iocb->ki_filp->f_mapping;
/* will be released by direct_io_worker */
mutex_lock(&inode->i_mutex);
retval = filemap_write_and_wait_range(mapping, offset,
end - 1);
if (retval) {
mutex_unlock(&inode->i_mutex);
kfree(dio);
goto out;
}
}
/*
* Will be released at I/O completion, possibly in a
* different thread.
*/
down_read_non_owner(&inode->i_alloc_sem);
}
/*
* For file extending writes updating i_size before data
* writeouts complete can expose uninitialized blocks. So
* even for AIO, we need to wait for i/o to complete before
* returning in this case.
*/
dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
(end > i_size_read(inode)));
retval = direct_io_worker(rw, iocb, inode, iov, offset,
nr_segs, blkbits, get_block, end_io, dio);
/*
* In case of error extending write may have instantiated a few
* blocks outside i_size. Trim these off again for DIO_LOCKING.
*
* NOTE: filesystems with their own locking have to handle this
* on their own.
*/
if (flags & DIO_LOCKING) {
if (unlikely((rw & WRITE) && retval < 0)) {
loff_t isize = i_size_read(inode);
if (end > isize)
vmtruncate(inode, isize);
}
}
out:
return retval;
}
EXPORT_SYMBOL(__blockdev_direct_IO);
コメント†