📄 llite_mmap.c
                node = ll_node_from_inode(inode, policy.l_extent.start,
                                          policy.l_extent.end,
                                          mode_from_vma(vma));
                if (IS_ERR(node)) {
                        CERROR("not enough mem for lock_tree_node!\n");
                        RETURN(-ENOMEM);
                }
                lt_insert(tree, node);

                if (vma->vm_end - addr >= count)
                        break;

                count -= vma->vm_end - addr;
                addr = vma->vm_end;
        }
        RETURN(0);
}

/* FIXME: there is a pagefault race that goes as follows (2.4 only):
 * 1. A user process on node A accesses a portion of a mapped file,
 *    resulting in a page fault.  The pagefault handler invokes the
 *    ll_nopage function, which reads the page into memory.
 * 2. A user process on node B writes to the same portion of the file
 *    (either via mmap or write()), which causes node A to cancel the
 *    lock and truncate the page.
 * 3. Node A then executes the rest of do_no_page(), entering the
 *    now-invalid page into the PTEs.
 *
 * Making the whole of do_no_page a hook, so that both the page cache and
 * the page-table installation are covered by the DLM lock, would
 * eliminate this race.
 *
 * In 2.6, the truncate_count of address_space can cover this race.
 */
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
                       int *type)
#else
struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
                       int type /* unused */)
#endif
{
        struct file *filp = vma->vm_file;
        struct ll_file_data *fd = LUSTRE_FPRIVATE(filp);
        struct inode *inode = filp->f_dentry->d_inode;
        struct lustre_handle lockh = { 0 };
        ldlm_policy_data_t policy;
        ldlm_mode_t mode;
        struct page *page = NULL;
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lov_stripe_md *lsm;
        struct ost_lvb lvb;
        __u64 kms, old_mtime;
        unsigned long pgoff, size, rand_read, seq_read;
        int rc = 0;
        ENTRY;

        if (lli->lli_smd == NULL) {
                CERROR("No lsm on fault?\n");
                RETURN(NULL);
        }

        ll_clear_file_contended(inode);

        /* start and end the lock on the first and last bytes in the page */
        policy_from_vma(&policy, vma, address, CFS_PAGE_SIZE);

        CDEBUG(D_MMAP, "nopage vma %p inode %lu, locking ["LPU64", "LPU64"]\n",
               vma, inode->i_ino, policy.l_extent.start, policy.l_extent.end);

        mode = mode_from_vma(vma);
        old_mtime = LTIME_S(inode->i_mtime);

        lsm = lli->lli_smd;
        rc = ll_extent_lock(fd, inode, lsm, mode, &policy,
                            &lockh, LDLM_FL_CBPENDING | LDLM_FL_NO_LRU);
        if (rc != 0)
                RETURN(NULL);

        if (vma->vm_flags & VM_EXEC && LTIME_S(inode->i_mtime) != old_mtime)
                CWARN("binary changed. inode %lu\n", inode->i_ino);

        lov_stripe_lock(lsm);
        inode_init_lvb(inode, &lvb);
        obd_merge_lvb(ll_i2obdexp(inode), lsm, &lvb, 1);
        kms = lvb.lvb_size;

        pgoff = ((address - vma->vm_start) >> CFS_PAGE_SHIFT) + vma->vm_pgoff;
        size = (kms + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;

        if (pgoff >= size) {
                lov_stripe_unlock(lsm);
                ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
        } else {
                /* XXX change inode size without ll_inode_size_lock() held!
                 *     there is a race condition with truncate path. (see
                 *     ll_extent_lock) */
                /* XXX i_size_write() is not used because it is not safe to
                 *     take the ll_inode_size_lock() due to a potential lock
                 *     inversion (bug 6077).  And since it's not safe to use
                 *     i_size_write() without a covering mutex we do the
                 *     assignment directly.  It is not critical that the
                 *     size be correct. */
                /* NOTE: region is within kms and, hence, within real file
                 * size (A).  We need to increase i_size to cover the read
                 * region so that generic_file_read() will do its job, but
                 * that doesn't mean the kms size is _correct_, it is only
                 * the _minimum_ size.  If someone does a stat they will get
                 * the correct size which will always be >= the kms value
                 * here.  b=11081 */
                if (i_size_read(inode) < kms) {
                        inode->i_size = kms;
                        CDEBUG(D_INODE, "ino=%lu, updating i_size %llu\n",
                               inode->i_ino, i_size_read(inode));
                }
                lov_stripe_unlock(lsm);
        }

        /* If mapping is writeable, adjust kms to cover this page,
         * but do not extend kms beyond actual file size.
         * policy.l_extent.end is set to the end of the page by
         * policy_from_vma.  bug 10919 */
        lov_stripe_lock(lsm);
        if (mode == LCK_PW)
                obd_adjust_kms(ll_i2obdexp(inode), lsm,
                               min_t(loff_t, policy.l_extent.end + 1,
                                     i_size_read(inode)), 0);
        lov_stripe_unlock(lsm);

        /* disable VM_SEQ_READ and use VM_RAND_READ to make sure that
         * the kernel will not read other pages not covered by ldlm in
         * filemap_nopage.  we do our readahead in ll_readpage. */
        rand_read = vma->vm_flags & VM_RAND_READ;
        seq_read = vma->vm_flags & VM_SEQ_READ;
        vma->vm_flags &= ~VM_SEQ_READ;
        vma->vm_flags |= VM_RAND_READ;

        page = filemap_nopage(vma, address, type);
        LL_CDEBUG_PAGE(D_PAGE, page, "got addr %lu type %lx\n", address,
                       (long)type);
        vma->vm_flags &= ~VM_RAND_READ;
        vma->vm_flags |= (rand_read | seq_read);

        ll_extent_unlock(fd, inode, ll_i2info(inode)->lli_smd, mode, &lockh);
        RETURN(page);
}
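/* Worked example for the pgoff/size check above, assuming 4 KiB pages
 * (CFS_PAGE_SHIFT == 12) and purely hypothetical numbers: a fault on the
 * third page of a mapping with vm_pgoff == 0 gives pgoff == 2.  If the
 * merged LVB reports kms == 5000 bytes, then size == (5000 + 4095) >> 12
 * == 2, so pgoff >= size and ll_nopage() takes the ll_glimpse_size()
 * branch to ask the OSTs for an authoritative size instead of trusting
 * the local known-minimum size (kms). */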
/* To avoid cancelling the locks that cover a mmapped region under lock
 * cache pressure, we track the mapped vma count in lli_mmap_cnt.
 * ll_vm_open():  when the first vma is linked, split the locks from the lru.
 * ll_vm_close(): when the last vma is unlinked, join all of this file's
 *                locks to the lru.
 *
 * XXX for performance we don't check whether the vma region and the lock
 *     region actually overlap. */
static void ll_vm_open(struct vm_area_struct *vma)
{
        struct inode *inode = vma->vm_file->f_dentry->d_inode;
        struct ll_inode_info *lli = ll_i2info(inode);
        ENTRY;

        LASSERT(vma->vm_file);

        spin_lock(&lli->lli_lock);
        LASSERT(atomic_read(&lli->lli_mmap_cnt) >= 0);

        atomic_inc(&lli->lli_mmap_cnt);
        if (atomic_read(&lli->lli_mmap_cnt) == 1) {
                struct lov_stripe_md *lsm = lli->lli_smd;
                struct ll_sb_info *sbi = ll_i2sbi(inode);
                int count;

                spin_unlock(&lli->lli_lock);

                if (!lsm)
                        return;
                count = obd_join_lru(sbi->ll_osc_exp, lsm, 0);
                VMA_DEBUG(vma, "split %d unused locks from lru\n", count);
        } else {
                spin_unlock(&lli->lli_lock);
        }
}

static void ll_vm_close(struct vm_area_struct *vma)
{
        struct inode *inode = vma->vm_file->f_dentry->d_inode;
        struct ll_inode_info *lli = ll_i2info(inode);
        ENTRY;

        LASSERT(vma->vm_file);

        spin_lock(&lli->lli_lock);
        LASSERT(atomic_read(&lli->lli_mmap_cnt) > 0);

        atomic_dec(&lli->lli_mmap_cnt);
        if (atomic_read(&lli->lli_mmap_cnt) == 0) {
                struct lov_stripe_md *lsm = lli->lli_smd;
                struct ll_sb_info *sbi = ll_i2sbi(inode);
                int count;

                spin_unlock(&lli->lli_lock);

                if (!lsm)
                        return;
                count = obd_join_lru(sbi->ll_osc_exp, lsm, 1);
                VMA_DEBUG(vma, "join %d unused locks to lru\n", count);
        } else {
                spin_unlock(&lli->lli_lock);
        }
}

#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
#ifndef HAVE_FILEMAP_POPULATE
static int (*filemap_populate)(struct vm_area_struct *area,
                               unsigned long address, unsigned long len,
                               pgprot_t prot, unsigned long pgoff,
                               int nonblock);
#endif
static int ll_populate(struct vm_area_struct *area, unsigned long address,
                       unsigned long len, pgprot_t prot, unsigned long pgoff,
                       int nonblock)
{
        int rc = 0;
        ENTRY;

        /* always set nonblock to true to avoid page read-ahead */
        rc = filemap_populate(area, address, len, prot, pgoff, 1);
        RETURN(rc);
}
#endif

/* return the user space pointer that maps to a file offset via a vma */
static inline unsigned long file_to_user(struct vm_area_struct *vma,
                                         __u64 byte)
{
        return vma->vm_start +
               (byte - ((__u64)vma->vm_pgoff << CFS_PAGE_SHIFT));
}
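/* Worked example for file_to_user() above, with hypothetical values and
 * 4 KiB pages (CFS_PAGE_SHIFT == 12): a vma with vm_start == 0x40000000
 * and vm_pgoff == 2 maps the file starting at offset 0x2000, so file
 * offset 0x3000 resolves to the user address
 * 0x40000000 + (0x3000 - 0x2000) == 0x40001000.  teardown_vmas() below
 * uses exactly this conversion to turn the [first, last] byte range into
 * a user-address range to zap. */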
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
/* [first, last] are the byte offsets affected.
 * vm_{start, end} are user addresses of the first byte of the mapping and
 *      the next byte beyond it
 * vm_pgoff is the page index of the first byte in the mapping */
static void teardown_vmas(struct vm_area_struct *vma, __u64 first,
                          __u64 last)
{
        unsigned long address, len;

        for (; vma; vma = vma->vm_next_share) {
                if (last >> CFS_PAGE_SHIFT < vma->vm_pgoff)
                        continue;
                if (first >> CFS_PAGE_SHIFT >= (vma->vm_pgoff +
                    ((vma->vm_end - vma->vm_start) >> CFS_PAGE_SHIFT)))
                        continue;

                /* XXX to avoid unmapping the COW pages of a running file,
                 * don't unmap these private writable mappings here, even
                 * though that breaks private mappings a little.
                 *
                 * the clean way is to check the mapping of every page and
                 * unmap only the non-COW pages, just like
                 * unmap_mapping_range() with even_cows == 0 in kernel 2.6. */
                if (!(vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
                        continue;

                address = max((unsigned long)vma->vm_start,
                              file_to_user(vma, first));
                len = min((unsigned long)vma->vm_end,
                          file_to_user(vma, last) + 1) - address;

                VMA_DEBUG(vma, "zapping vma [first="LPU64" last="LPU64" "
                          "address=%ld len=%ld]\n", first, last, address, len);
                LASSERT(len > 0);
                ll_zap_page_range(vma, address, len);
        }
}
#endif

/* XXX put nice comment here.  talk about __free_pte -> dirty pages and
 * nopage's reference passing to the pte */
int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last)
{
        int rc = -ENOENT;
        ENTRY;

        LASSERTF(last > first, "last "LPU64" first "LPU64"\n", last, first);
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
        if (mapping_mapped(mapping)) {
                rc = 0;
                unmap_mapping_range(mapping, first + CFS_PAGE_SIZE - 1,
                                    last - first + 1, 0);
        }
#else
        spin_lock(&mapping->i_shared_lock);
        if (mapping->i_mmap != NULL) {
                rc = 0;
                teardown_vmas(mapping->i_mmap, first, last);
        }
        if (mapping->i_mmap_shared != NULL) {
                rc = 0;
                teardown_vmas(mapping->i_mmap_shared, first, last);
        }
        spin_unlock(&mapping->i_shared_lock);
#endif

        RETURN(rc);
}

static struct vm_operations_struct ll_file_vm_ops = {
        .nopage         = ll_nopage,
        .open           = ll_vm_open,
        .close          = ll_vm_close,
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
        .populate       = ll_populate,
#endif
};

int ll_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        int rc;
        ENTRY;

        ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode), LPROC_LL_MAP, 1);
        rc = generic_file_mmap(file, vma);
        if (rc == 0) {
#if !defined(HAVE_FILEMAP_POPULATE) && \
    (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
                if (!filemap_populate)
                        filemap_populate = vma->vm_ops->populate;
#endif
                vma->vm_ops = &ll_file_vm_ops;
                vma->vm_ops->open(vma);

                /* update the inode's size and mtime */
                rc = ll_glimpse_size(file->f_dentry->d_inode, 0);
        }

        RETURN(rc);
}
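/* ll_file_mmap() only takes effect once it is registered as the ->mmap
 * method of the llite file_operations, which happens elsewhere in the
 * llite layer rather than in this file.  A minimal sketch of that wiring
 * (the neighbouring methods are listed here as an assumption, not copied
 * from this source tree):
 *
 *      struct file_operations ll_file_operations = {
 *              .read    = ll_file_read,
 *              .write   = ll_file_write,
 *              .open    = ll_file_open,
 *              .release = ll_file_release,
 *              .mmap    = ll_file_mmap,
 *      };
 *
 * With that in place, generic_file_mmap() performs the generic sanity
 * checks, ll_file_mmap() swaps vma->vm_ops for ll_file_vm_ops, and every
 * subsequent fault on the region goes through ll_nopage() under a DLM
 * extent lock. */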