mmap.c
				binode->ba_meta_locked = 0;
				mlog_errno(status);
				goto bail;
			}
			binode->ba_data_locked = 1;
		}
		ocfs2_add_io_marker(inode, &binode->ba_task);
	}

	status = 0;
bail:
	return status;
}

void ocfs2_unlock_buffer_inodes(struct ocfs2_buffer_lock_ctxt *ctxt)
{
	struct ocfs2_backing_inode *binode;
	struct rb_node *node;

	/* dlm locks don't mask ints.. this should be lower down */
	BUG_ON(in_interrupt());

	/* unlock in reverse order to minimize waking forward lockers */
	while ((node = rb_last(&ctxt->b_inodes)) != NULL) {
		binode = rb_entry(node, struct ocfs2_backing_inode, ba_node);

		ocfs2_del_io_marker(binode->ba_inode, &binode->ba_task);

		if (binode->ba_data_locked)
			ocfs2_data_unlock(binode->ba_inode,
					  binode->ba_lock_data_level);
		if (binode->ba_meta_locked)
			ocfs2_meta_unlock(binode->ba_inode,
					  binode->ba_lock_meta_level);

		rb_erase(node, &ctxt->b_inodes);
		kfree(binode);
	}

	ctxt->b_next_unlocked = NULL;
}

static int ocfs2_write_remove_suid(struct inode *inode)
{
	int ret;
	struct buffer_head *bh = NULL;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_journal_handle *handle;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di;

	mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno,
		   inode->i_mode);

	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
	if (handle == NULL) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_trans;
	}

	ocfs2_set_inode_lock_trans(osb->journal, inode);

	ret = ocfs2_journal_access(handle, inode, bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_bh;
	}

	inode->i_mode &= ~S_ISUID;
	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
		inode->i_mode &= ~S_ISGID;

	di = (struct ocfs2_dinode *) bh->b_data;
	di->i_mode = cpu_to_le16(inode->i_mode);

	ret = ocfs2_journal_dirty(handle, bh);
	if (ret < 0)
		mlog_errno(ret);
out_bh:
	brelse(bh);
out_trans:
	ocfs2_commit_trans(handle);
out:
	mlog_exit(ret);
	return ret;
}

static inline int ocfs2_write_should_remove_suid(struct inode *inode)
{
	mode_t mode = inode->i_mode;

	if (!capable(CAP_FSETID)) {
		if (unlikely(mode & S_ISUID))
			return 1;

		if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
			return 1;
	}
	return 0;
}

/*
 * This builds up the locking state that will be used by a write. Both normal
 * file writes and AIO writes come in through here. This function does no
 * teardown on its own. The caller must examine the info struct to see if it
 * needs to release locks or i_mutex, etc. This function is also restartable
 * in that it can return -EIOCBRETRY if it would have blocked in the dlm. It
 * stores its partial progress in the info struct so the caller can call back
 * in when it thinks the dlm won't block any more. Thus, the caller must zero
 * the info struct before calling in the first time.
 */
ssize_t ocfs2_write_lock_maybe_extend(struct file *filp,
				      const char __user *buf,
				      size_t count,
				      loff_t *ppos,
				      struct ocfs2_write_lock_info *info,
				      struct ocfs2_buffer_lock_ctxt *ctxt)
{
	int ret = 0;
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_backing_inode *ba;
	int status;
	loff_t saved_ppos;
	u64 bytes_added = 0;

	/*
	 * The target inode is different from the other inodes. In O_DIRECT
	 * it gets a PR data lock (see below) and when appending it gets an
	 * EX meta lock. It's locked manually here, though the backing_inode
	 * fields are maintained while doing so, so that unlock does the
	 * right thing.
	 */
	if (info->wl_target_binode == NULL) {
		ret = ocfs2_setup_io_locks(inode->i_sb, inode,
					   (char __user *) buf, count, ctxt,
					   &info->wl_target_binode);
		if (ret < 0) {
			BUG_ON(ret == -EIOCBRETRY);
			mlog_errno(ret);
			goto bail;
		}
	}

	ba = info->wl_target_binode;

	/* This will lock everyone in the context whose order puts
	 * them before us. */
	if (!info->wl_have_before) {
		info->wl_unlock_ctxt = 1;
		ret = ocfs2_lock_buffer_inodes(ctxt, inode);
		if (ret < 0) {
			if (ret != -EIOCBRETRY)
				mlog_errno(ret);
			goto bail;
		}
		info->wl_have_before = 1;
	}

	if (!info->wl_have_i_mutex) {
		mutex_lock(&inode->i_mutex);
		info->wl_have_i_mutex = 1;
	}

	ba->ba_lock_data_level = 1;
	if (filp->f_flags & O_APPEND)
		ba->ba_lock_meta_level = 1;

retry_meta_lock:
	if (!ba->ba_meta_locked) {
		status = ocfs2_meta_lock(inode, NULL, NULL,
					 ba->ba_lock_meta_level);
		if (status < 0) {
			mlog_errno(status);
			ret = status;
			goto bail;
		}
		ba->ba_meta_locked = 1;
	}

	/* Clear suid / sgid if necessary. We do this here instead of
	 * later in the write path because remove_suid() calls
	 * ->setattr without any hint that we may have already done
	 * our cluster locking. Since ocfs2_setattr() *must* take
	 * cluster locks to proceed, this will lead us to recursively
	 * lock the inode. There's also the dinode i_size state which
	 * can be lost via setattr during extending writes (we set
	 * inode->i_size at the end of a write). */
	if (ocfs2_write_should_remove_suid(inode)) {
		if (ba->ba_lock_meta_level == 0) {
			mlog(0, "inode %"MLFu64", had a PR, looping back for "
			     "EX so we can remove SUID\n",
			     OCFS2_I(inode)->ip_blkno);
			ocfs2_meta_unlock(inode, ba->ba_lock_meta_level);
			ba->ba_meta_locked = 0;
			ba->ba_lock_meta_level = 1;
			goto retry_meta_lock;
		}

		status = ocfs2_write_remove_suid(inode);
		if (status < 0) {
			mlog_errno(status);
			ret = status;
			goto bail;
		}
	}

	/* work on a copy of ppos until we're sure that we won't have
	 * to recalculate it due to relocking. */
	saved_ppos = *ppos;

	if (filp->f_flags & O_APPEND) {
		saved_ppos = i_size_read(inode);
		mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_ppos);

#ifdef OCFS2_ORACORE_WORKAROUNDS
		if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
			/* ugh, work around some applications which open
			 * everything O_DIRECT + O_APPEND and really don't
			 * mean to use O_DIRECT. */
			filp->f_flags &= ~O_DIRECT;
		}
#endif
	}

	if (filp->f_flags & O_DIRECT) {
#ifdef OCFS2_ORACORE_WORKAROUNDS
		if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
			int sector_size = 1 << osb->s_sectsize_bits;

			if ((saved_ppos & (sector_size - 1)) ||
			    (count & (sector_size - 1)) ||
			    ((unsigned long)buf & (sector_size - 1))) {
				info->wl_do_direct_io = 0;
				filp->f_flags |= O_SYNC;
			} else {
				info->wl_do_direct_io = 1;
			}
		} else
#endif
			info->wl_do_direct_io = 1;

		mlog(0, "O_DIRECT\n");
	}

	/*
	 * We get PR data locks even for O_DIRECT. This allows concurrent
	 * O_DIRECT writes, but doesn't let O_DIRECT writes race with
	 * extending and buffered zeroing writes. If they did race, the
	 * buffered zeroing could be written back after the O_DIRECT write
	 * and overwrite it. It's one thing to tell people not to mix
	 * buffered and O_DIRECT writes, but expecting them to understand
	 * that file extension is also an implicit buffered write is too
	 * much. By getting the PR we force writeback of the buffered
	 * zeroing before proceeding.
	 */
	if (info->wl_do_direct_io && !(filp->f_flags & O_APPEND))
		ba->ba_lock_data_level = 0;

	info->wl_newsize = count + saved_ppos;
	if (filp->f_flags & O_APPEND)
		info->wl_newsize = count + i_size_read(inode);

	/* get the locking straight for the extending case */
	if (info->wl_newsize > i_size_read(inode)) {
		if (ba->ba_lock_meta_level == 0) {
			mlog(0, "inode %"MLFu64", had a PR meta, looping back "
			     "for EX\n", OCFS2_I(inode)->ip_blkno);
			ocfs2_meta_unlock(inode, ba->ba_lock_meta_level);
			ba->ba_meta_locked = 0;
			ba->ba_lock_meta_level = 1;
			goto retry_meta_lock;
		}
		ba->ba_lock_data_level = 1;
	}

	/*
	 * get the data lock before extending so that we can be sure
	 * that we'll be able to zero under lock coverage. This does
	 * get an EX data lock for O_DIRECT but as long as zeroing is
	 * buffered we really must hold the lock while manipulating the
	 * page cache.
	 */
	if (!ba->ba_data_locked) {
		status = ocfs2_data_lock(inode, ba->ba_lock_data_level);
		if (status < 0) {
			mlog_errno(status);
			ret = status;
			goto bail;
		}
		ba->ba_data_locked = 1;
	}

	mlog(0, "ppos=%lld newsize=%"MLFu64" cursize=%lld\n",
	     saved_ppos, info->wl_newsize, i_size_read(inode));

	if (info->wl_newsize > i_size_read(inode)) {
		mlog(0, "Writing at EOF, will need more allocation: "
		     "i_size=%lld, need=%"MLFu64"\n",
		     i_size_read(inode), info->wl_newsize);

		/* If we extend AT ALL here then we update our state
		 * and continue the write call, regardless of error --
		 * this is basically a short write. */
		status = ocfs2_extend_file(osb, inode, info->wl_newsize,
					   &bytes_added);
		if (status < 0 && (!bytes_added)) {
			if (status != -ERESTARTSYS && status != -EINTR &&
			    status != -ENOSPC) {
				mlog_errno(status);
				mlog(ML_ERROR, "Failed to extend inode "
				     "%"MLFu64" from %lld to %"MLFu64,
				     OCFS2_I(inode)->ip_blkno, *ppos,
				     info->wl_newsize);
			}
			ret = status;
			goto bail;
		}

		info->wl_extended = 1;

		/* We need to recalculate newsize and count according
		 * to what extend could give us. If we got the whole
		 * extend then this doesn't wind up changing the
		 * values. */
		info->wl_newsize = i_size_read(inode) + bytes_added;
		count = info->wl_newsize - saved_ppos;

		if (status < 0 && status != -ENOSPC && status != -EINTR &&
		    status != -ERESTARTSYS)
			mlog(ML_ERROR, "status return of %d extending inode "
			     "%"MLFu64"\n", status,
			     OCFS2_I(inode)->ip_blkno);
		status = 0;
	}

	/* we've got whatever cluster lock is appropriate now, so we
	 * can stuff *ppos back. */
	*ppos = saved_ppos;

	/* This will lock everyone whose order puts them *after* our
	 * inode. */
	ret = ocfs2_lock_buffer_inodes(ctxt, NULL);
	if (ret < 0) {
		if (ret != -EIOCBRETRY)
			mlog_errno(ret);
		goto bail;
	}

bail:
	mlog_exit(ret);
	return ret;
}

#if 0
static void ocfs2_buffer_ctxt_debug(struct ocfs2_buffer_lock_ctxt *ctxt)
{
	struct ocfs2_backing_inode *binode;
	struct inode *inode;
	struct rb_node *node;

	printk("(%u) ocfs2: buffer lock ctxt: direct io = %d\n",
	       current->pid, ctxt->b_lock_direct);

	node = rb_first(&ctxt->b_inodes);
	while (node) {
		binode = rb_entry(node, struct ocfs2_backing_inode, ba_node);
		inode = binode->ba_inode;

		printk("(%u) ocfs2: inode %llu, locked %d, is target? %s\n",
		       current->pid, OCFS2_I(inode)->ip_blkno,
		       binode->ba_locked,
		       ocfs2_buffer_lock_is_target(ctxt, inode) ?
		       "yes" : "no");

		node = rb_next(node);
	}
}
#endif
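
Editor's note: the comment above ocfs2_write_lock_maybe_extend() describes a restartable protocol (zero the info struct once, call in, handle -EIOCBRETRY by calling back in, then tear down based on the partial progress recorded in info), but the callers are not part of this excerpt. The sketch below is not from mmap.c; it is a hypothetical synchronous caller (example_file_write) written only to illustrate that protocol, assuming the same ocfs2 headers as mmap.c are included. Zeroing both structs with memset and retrying -EIOCBRETRY in a tight loop are simplifying assumptions; a real AIO caller would instead return -EIOCBRETRY to the aio core and be re-driven when the dlm lock completes.

/* Hypothetical illustration only -- not part of the original file. */
static ssize_t example_file_write(struct file *filp, const char __user *buf,
				  size_t count, loff_t *ppos)
{
	ssize_t ret;
	struct ocfs2_write_lock_info info;
	struct ocfs2_buffer_lock_ctxt ctxt;

	/* the function requires a zeroed info struct on first entry;
	 * zeroing the ctxt the same way is an assumption of this sketch */
	memset(&info, 0, sizeof(info));
	memset(&ctxt, 0, sizeof(ctxt));

	do {
		ret = ocfs2_write_lock_maybe_extend(filp, buf, count, ppos,
						    &info, &ctxt);
		/* a synchronous caller can simply call back in; an AIO
		 * caller would propagate -EIOCBRETRY instead of looping */
	} while (ret == -EIOCBRETRY);

	if (ret >= 0) {
		/* ... perform the buffered or O_DIRECT write here,
		 * honouring info.wl_do_direct_io and info.wl_newsize ... */
	}

	/* teardown mirrors the partial progress recorded in info */
	if (info.wl_unlock_ctxt)
		ocfs2_unlock_buffer_inodes(&ctxt);
	if (info.wl_have_i_mutex)
		mutex_unlock(&filp->f_dentry->d_inode->i_mutex);

	return ret;
}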