📄 aio.c
字号:
/* -*- mode: c; c-basic-offset: 8; -*- * vim: noexpandtab sw=8 ts=8 sts=0: * * aio.c * * aio read and write * * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this program; if not, write to the * Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 021110-1307, USA. */#include <linux/fs.h>#include <linux/types.h>#include <linux/slab.h>#include <linux/highmem.h>#include <linux/pagemap.h>#include <linux/uio.h>#define MLOG_MASK_PREFIX ML_FILE_IO|ML_AIO#include <cluster/masklog.h>#include "ocfs2.h"#include "aio.h"#include "alloc.h"#include "dir.h"#include "dlmglue.h"#include "extent_map.h"#include "file.h"#include "sysfile.h"#include "inode.h"#include "mmap.h"#include "suballoc.h"struct ocfs2_kiocb_private { struct ocfs2_kiocb_private *kp_teardown_next; struct ocfs2_super *kp_osb; unsigned kp_have_alloc_sem:1, kp_have_write_locks:1; struct inode *kp_inode; struct ocfs2_buffer_lock_ctxt kp_ctxt; struct ocfs2_write_lock_info kp_info;};static void okp_teardown(struct ocfs2_kiocb_private *okp){ mlog(0, "okp %p\n", okp); BUG_ON(okp->kp_inode == NULL); if (okp->kp_info.wl_unlock_ctxt) ocfs2_unlock_buffer_inodes(&okp->kp_ctxt); if (okp->kp_have_alloc_sem) up_read(&OCFS2_I(okp->kp_inode)->ip_alloc_sem); iput(okp->kp_inode); kfree(okp);}void okp_teardown_from_list(void *data){ struct ocfs2_super *osb = data; struct ocfs2_kiocb_private *okp, *next; for (okp = xchg(&osb->osb_okp_teardown_next, NULL); okp != NULL; okp = next) { next = okp->kp_teardown_next; okp_teardown(okp); }}/* * This releases the dlm locks we held across an aio operation and frees the * space we were tracking them in. * * While aio operations are in flight they have a vfsmnt reference for the file * which prevents unmount. This dtor gets called *after* that ref is dropped, * however, so we have to make sure to account for pending work we have here in * the unmount path. The race starts when aio does its fputs, before it calls * dtor which queues work, so just synchronizing with the work queue could miss * that first phase. So unmount first waits for the pending count to drop. * Then it has to wait for keventd to finish the work freeing the okps. * * _dtor can be called from just about any context and lock teardown is * anything but interrupt safe. We used to hand the okps to * okp_teardown_from_list with a normal list_head and irq masking lock but we * want to avoid masking interrupts so it was shifted to the {cmp,}xchg() and * atomic_t. * * Adding to the singly linked ->next list is only a little tricky. We have to * watch for races between sampling the head to assign ->next in the inserting * okp and a new head being written before we point the head to the inserting * okp. */static void ocfs2_ki_dtor(struct kiocb *iocb){ struct ocfs2_kiocb_private *next, *okp = iocb->private; struct ocfs2_super *osb = okp->kp_osb; mlog(0, "iocb %p okp %p\n", iocb, okp); /* okp_alloc only assigns the iocb->private and ->ki_dtor pointers if * it was able to alloc the okp and get an inode reference */ BUG_ON(okp == NULL); BUG_ON(okp->kp_inode == NULL); /* we had better not try to work with this iocb again */ iocb->private = NULL; /* once this cmpxchg succeeds the okp can be freed so we have to be * careful not to deref it when testing success */ do { next = osb->osb_okp_teardown_next; okp->kp_teardown_next = next; } while (cmpxchg(&osb->osb_okp_teardown_next, next, okp) != next); schedule_work(&osb->osb_okp_teardown_work); if (atomic_dec_and_test(&osb->osb_okp_pending)) wake_up(&osb->osb_okp_pending_wq);}/* see ocfs2_ki_dtor() */void ocfs2_wait_for_okp_destruction(struct ocfs2_super *osb){ /* first wait for okps to enter the work queue */ wait_event(osb->osb_okp_pending_wq, atomic_read(&osb->osb_okp_pending) == 0); /* * then wait for keventd to finish with all its work, including ours. * * XXX this makes me very nervous. what if our work blocks keventd * during an unlock and the unlock can only proceed if keventd * can get to some more work that the dlm might have queued? * do we push any dlm work to keventd? */ flush_scheduled_work();}/* just to stop sys_io_cancel() from spewing to the console when it sees an * iocb without ki_cancel */static int ocfs2_ki_cancel(struct kiocb *iocb, struct io_event *ev){ mlog(0, "iocb %p\n", iocb); aio_put_req(iocb); return -EAGAIN;}static struct ocfs2_kiocb_private *okp_alloc(struct kiocb *iocb){ struct inode *inode = iocb->ki_filp->f_dentry->d_inode; struct ocfs2_kiocb_private *okp; struct ocfs2_super *osb; okp = kcalloc(1, sizeof(*okp), GFP_KERNEL); if (okp == NULL) { okp = ERR_PTR(-ENOMEM); goto out; } /* our dtor only gets registerd if we can guarantee that it holds * a reference to the inode */ okp->kp_inode = igrab(inode); if (okp->kp_inode == NULL) { kfree(okp); okp = ERR_PTR(-EINVAL); goto out; } /* unmount syncs with work using this ref before destroying the osb */ osb = OCFS2_SB(inode->i_sb); okp->kp_osb = osb; iocb->private = okp; iocb->ki_dtor = ocfs2_ki_dtor; iocb->ki_cancel = ocfs2_ki_cancel; INIT_BUFFER_LOCK_CTXT(&okp->kp_ctxt); atomic_inc(&osb->osb_okp_pending);out: mlog(0, "iocb %p returning %p\n", iocb, okp); return okp;}/* The DLM supports a minimal notion of AIO lock acquiry. Instead of testing * the iocb or current-> like kernel fs/block paths tend to, it takes an * explicit callback which it calls when a lock state attempt makes forward * progress. It would be better if it worked with the native * kernel AIO mechanics */static void ocfs2_aio_kick(int status, unsigned long data){ struct kiocb *iocb = (struct kiocb *)data; /* XXX worry about racing with ki_cancel once we set it */ mlog(0, "iocb %p\n", iocb); kick_iocb(iocb);}/* this is called as iocb->ki_retry so it is careful to only repeat * what is needed */ssize_t ocfs2_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos){ struct ocfs2_kiocb_private *okp = iocb->private; struct file *filp = iocb->ki_filp; struct inode *inode = filp->f_dentry->d_inode; struct ocfs2_backing_inode *target_binode; ssize_t ret, ret2; sigset_t blocked, oldset; /* * The DLM doesn't block waiting for network traffic or anything, it * modifies state and calls our callback when things have changed. * However, it still likes to check signals and return ERESTARTSYS. * The AIO core does not appreciate ERESTARTSYS as its semantics are * not exactly clear for submission, etc. So we block signals and * ensure that the DLM won't notice them. The caller, particularly * sys_io_getevents(), will eventually check signals before sleeping * and so things should still work as expected, if perhaps with * slightly higher signal delivery latency. */ sigfillset(&blocked); ret = sigprocmask(SIG_BLOCK, &blocked, &oldset); if (ret < 0) { mlog_errno(ret); goto out; } mlog(0, "iocb %p okp %p\n", iocb, okp); if (okp == NULL) { okp = okp_alloc(iocb); if (IS_ERR(okp)) { ret = PTR_ERR(okp); mlog_errno(ret); goto setmask; } ret = ocfs2_setup_io_locks(inode->i_sb, inode, buf, count, &okp->kp_ctxt, &target_binode); if (ret < 0) { mlog_errno(ret); goto setmask; } okp->kp_ctxt.b_cb = ocfs2_aio_kick; okp->kp_ctxt.b_cb_data = (unsigned long)iocb; target_binode->ba_lock_data_level = 0; } /* this might return EIOCBRETRY and we'll come back again to * continue the locking. It's harmless to call it once it has * returned success.. */ okp->kp_info.wl_unlock_ctxt = 1; /* re-use the write info path */ ret = ocfs2_lock_buffer_inodes(&okp->kp_ctxt, NULL); if (ret < 0) { if (ret != -EIOCBRETRY) mlog_errno(ret); goto setmask; } /* hold the ip_alloc_sem across the op */ if (!okp->kp_have_alloc_sem) { down_read(&OCFS2_I(inode)->ip_alloc_sem); okp->kp_have_alloc_sem = 1; } ret = generic_file_aio_read(iocb, buf, count, pos);setmask: ret2 = sigprocmask(SIG_SETMASK, &oldset, NULL); if (ret2 < 0) { mlog_errno(ret2); if (ret == 0) ret = ret2; }out: /* ki_dtor will always be called eventually, no tear down here */ mlog(0, "iocb %p returning %lld\n", iocb, (long long)ret); return ret;}/* this is called as iocb->ki_retry so it is careful to only repeat * what is needed */ssize_t ocfs2_file_aio_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos){ struct ocfs2_kiocb_private *okp = iocb->private; struct file *filp = iocb->ki_filp; struct inode *inode = filp->f_dentry->d_inode; ssize_t ret = 0, ret2; sigset_t blocked, oldset; struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count }; /* explained up in ocfs2_file_aio_read() */ sigfillset(&blocked); ret = sigprocmask(SIG_BLOCK, &blocked, &oldset); if (ret < 0) { mlog_errno(ret); goto out; } mlog(0, "iocb %p okp %p\n", iocb, okp); if (okp == NULL) { okp = okp_alloc(iocb); if (IS_ERR(okp)) { ret = PTR_ERR(okp); mlog_errno(ret); goto up_io; } okp->kp_ctxt.b_cb = ocfs2_aio_kick; okp->kp_ctxt.b_cb_data = (unsigned long)iocb; } if (!okp->kp_have_write_locks) { ret = ocfs2_write_lock_maybe_extend(filp, buf, count, &iocb->ki_pos, &okp->kp_info, &okp->kp_ctxt); /* * XXX this looks totally broken.. what if _maybe_extend * returns EIOCBRETRY? it'll never be called again and the * op will simply proceed without locking? */ okp->kp_have_write_locks = 1; if (okp->kp_info.wl_extended) { /* * this is not a particularly nice place to do this but * extending aio in ocfs2 is not yet a priority. it * means that we'll write zeros in the buffered case * before then over-writing them with the real op. It * also sleeps in the aio submission context. */ ocfs2_file_finish_extension(inode, okp->kp_info.wl_newsize, okp->kp_info.wl_do_direct_io); okp->kp_info.wl_extended = 0; } if (ret) { mlog_errno(ret); goto up_io; } } /* hold the ip_alloc_sem across the op */ if (!okp->kp_have_alloc_sem) { down_read(&OCFS2_I(inode)->ip_alloc_sem); okp->kp_have_alloc_sem = 1; }up_io: /* * never hold i_mutex when we leave this function, nor when we call * g_f_a_w(). we've done all extending and inode field updating under * the i_mutex and we hold the ip_alloc_sem for reading across the ops. * ocfs2_direct_IO calls blockdev_direct_IO with NO_LOCKING. */ if (okp->kp_info.wl_have_i_mutex) { mutex_unlock(&inode->i_mutex); okp->kp_info.wl_have_i_mutex = 0; } if (ret == 0) ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); ret2 = sigprocmask(SIG_SETMASK, &oldset, NULL); if (ret2 < 0) { mlog_errno(ret2); if (ret == 0) ret = ret2; }out: /* ki_dtor will always be called eventually, no tear down here */ mlog(0, "iocb %p returning %lld\n", iocb, (long long)ret); return ret;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -