📄 llite_mmap.c
                node = ll_node_from_inode(inode, policy.l_extent.start,
                                          policy.l_extent.end,
                                          mode_from_vma(vma));
                if (IS_ERR(node)) {
                        CERROR("not enough mem for lock_tree_node!\n");
                        RETURN(-ENOMEM);
                }
                lt_insert(tree, node);

                if (vma->vm_end - addr >= count)
                        break;

                count -= vma->vm_end - addr;
                addr = vma->vm_end;
        }
        RETURN(0);
}

/* FIXME: there is a pagefault race that goes as follows (2.4 only):
 * 1. A user process on node A accesses a portion of a mapped file,
 *    resulting in a page fault.  The pagefault handler invokes the
 *    ll_nopage function, which reads the page into memory.
 * 2. A user process on node B writes to the same portion of the file
 *    (either via mmap or write()), which causes node A to cancel the
 *    lock and truncate the page.
 * 3. Node A then executes the rest of do_no_page(), entering the
 *    now-invalid page into the PTEs.
 *
 * Making the whole of do_no_page a hook, so that both the page cache and
 * the page-table installation are covered by the DLM lock, would
 * eliminate this race.
 *
 * In 2.6, the truncate_count of address_space can cover this race.
 */
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
                       int *type)
#else
struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
                       int type /* unused */)
#endif
{
        struct file *filp = vma->vm_file;
        struct ll_file_data *fd = LUSTRE_FPRIVATE(filp);
        struct inode *inode = filp->f_dentry->d_inode;
        struct lustre_handle lockh = { 0 };
        ldlm_policy_data_t policy;
        ldlm_mode_t mode;
        struct page *page = NULL;
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lov_stripe_md *lsm;
        struct ost_lvb lvb;
        __u64 kms, old_mtime;
        unsigned long pgoff, size, rand_read, seq_read;
        int rc = 0;
        ENTRY;

        if (lli->lli_smd == NULL) {
                CERROR("No lsm on fault?\n");
                RETURN(NULL);
        }

        ll_clear_file_contended(inode);

        /* start and end the lock on the first and last bytes in the page */
        policy_from_vma(&policy, vma, address, CFS_PAGE_SIZE);

        CDEBUG(D_MMAP, "nopage vma %p inode %lu, locking ["LPU64", "LPU64"]\n",
               vma, inode->i_ino, policy.l_extent.start, policy.l_extent.end);

        mode = mode_from_vma(vma);
        old_mtime = LTIME_S(inode->i_mtime);

        lsm = lli->lli_smd;
        rc = ll_extent_lock(fd, inode, lsm, mode, &policy,
                            &lockh, LDLM_FL_CBPENDING | LDLM_FL_NO_LRU);
        if (rc != 0)
                RETURN(NULL);

        if (vma->vm_flags & VM_EXEC && LTIME_S(inode->i_mtime) != old_mtime)
                CWARN("binary changed. inode %lu\n", inode->i_ino);

        lov_stripe_lock(lsm);
        inode_init_lvb(inode, &lvb);
        obd_merge_lvb(ll_i2obdexp(inode), lsm, &lvb, 1);
        kms = lvb.lvb_size;

        pgoff = ((address - vma->vm_start) >> CFS_PAGE_SHIFT) + vma->vm_pgoff;
        size = (kms + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;

        if (pgoff >= size) {
                lov_stripe_unlock(lsm);
                ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
        } else {
                /* XXX change inode size without ll_inode_size_lock() held!
                 *     there is a race condition with truncate path. (see
                 *     ll_extent_lock) */
                /* XXX i_size_write() is not used because it is not safe to
                 *     take the ll_inode_size_lock() due to a potential lock
                 *     inversion (bug 6077).  And since it's not safe to use
                 *     i_size_write() without a covering mutex we do the
                 *     assignment directly.  It is not critical that the
                 *     size be correct. */
                /* NOTE: region is within kms and, hence, within real file
                 * size (A).  We need to increase i_size to cover the read
                 * region so that generic_file_read() will do its job, but
                 * that doesn't mean the kms size is _correct_, it is only
                 * the _minimum_ size.  If someone does a stat they will get
                 * the correct size which will always be >= the kms value
                 * here.  b=11081 */
                if (i_size_read(inode) < kms) {
                        inode->i_size = kms;
                        CDEBUG(D_INODE, "ino=%lu, updating i_size %llu\n",
                               inode->i_ino, i_size_read(inode));
                }
                lov_stripe_unlock(lsm);
        }

        /* If mapping is writeable, adjust kms to cover this page,
         * but do not extend kms beyond actual file size.
         * policy.l_extent.end is set to the end of the page by
         * policy_from_vma.  bug 10919 */
        lov_stripe_lock(lsm);
        if (mode == LCK_PW)
                obd_adjust_kms(ll_i2obdexp(inode), lsm,
                               min_t(loff_t, policy.l_extent.end + 1,
                                     i_size_read(inode)), 0);
        lov_stripe_unlock(lsm);

        /* disable VM_SEQ_READ and use VM_RAND_READ to make sure that
         * the kernel will not read other pages not covered by ldlm in
         * filemap_nopage.  we do our readahead in ll_readpage. */
        rand_read = vma->vm_flags & VM_RAND_READ;
        seq_read = vma->vm_flags & VM_SEQ_READ;
        vma->vm_flags &= ~VM_SEQ_READ;
        vma->vm_flags |= VM_RAND_READ;

        page = filemap_nopage(vma, address, type);
        LL_CDEBUG_PAGE(D_PAGE, page, "got addr %lu type %lx\n", address,
                       (long)type);
        vma->vm_flags &= ~VM_RAND_READ;
        vma->vm_flags |= (rand_read | seq_read);

        ll_extent_unlock(fd, inode, ll_i2info(inode)->lli_smd, mode, &lockh);
        RETURN(page);
}
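/* Worked example for the pgoff/size check above, assuming 4 KiB pages
 * (CFS_PAGE_SHIFT == 12) and purely hypothetical numbers: a fault on the
 * third page of a mapping with vm_pgoff == 0 gives pgoff == 2.  If the
 * merged LVB reports kms == 5000 bytes, then size == (5000 + 4095) >> 12
 * == 2, so pgoff >= size and ll_nopage() takes the ll_glimpse_size()
 * branch to ask the OSTs for an authoritative size instead of trusting
 * the local known-minimum size (kms). */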
/* To avoid cancelling the locks that cover a mmapped region under lock
 * cache pressure, we track the mapped vma count in lli_mmap_cnt.
 * ll_vm_open():  when the first vma is linked, split the locks from the lru.
 * ll_vm_close(): when the last vma is unlinked, join all of this file's
 *                locks to the lru.
 *
 * XXX for performance we don't check whether the vma region and the lock
 *     region actually overlap. */
static void ll_vm_open(struct vm_area_struct *vma)
{
        struct inode *inode = vma->vm_file->f_dentry->d_inode;
        struct ll_inode_info *lli = ll_i2info(inode);
        ENTRY;

        LASSERT(vma->vm_file);

        spin_lock(&lli->lli_lock);
        LASSERT(atomic_read(&lli->lli_mmap_cnt) >= 0);

        atomic_inc(&lli->lli_mmap_cnt);
        if (atomic_read(&lli->lli_mmap_cnt) == 1) {
                struct lov_stripe_md *lsm = lli->lli_smd;
                struct ll_sb_info *sbi = ll_i2sbi(inode);
                int count;

                spin_unlock(&lli->lli_lock);

                if (!lsm)
                        return;
                count = obd_join_lru(sbi->ll_osc_exp, lsm, 0);
                VMA_DEBUG(vma, "split %d unused locks from lru\n", count);
        } else {
                spin_unlock(&lli->lli_lock);
        }
}

static void ll_vm_close(struct vm_area_struct *vma)
{
        struct inode *inode = vma->vm_file->f_dentry->d_inode;
        struct ll_inode_info *lli = ll_i2info(inode);
        ENTRY;

        LASSERT(vma->vm_file);

        spin_lock(&lli->lli_lock);
        LASSERT(atomic_read(&lli->lli_mmap_cnt) > 0);

        atomic_dec(&lli->lli_mmap_cnt);
        if (atomic_read(&lli->lli_mmap_cnt) == 0) {
                struct lov_stripe_md *lsm = lli->lli_smd;
                struct ll_sb_info *sbi = ll_i2sbi(inode);
                int count;

                spin_unlock(&lli->lli_lock);

                if (!lsm)
                        return;
                count = obd_join_lru(sbi->ll_osc_exp, lsm, 1);
                VMA_DEBUG(vma, "join %d unused locks to lru\n", count);
        } else {
                spin_unlock(&lli->lli_lock);
        }
}

#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
#ifndef HAVE_FILEMAP_POPULATE
static int (*filemap_populate)(struct vm_area_struct *area,
                               unsigned long address, unsigned long len,
                               pgprot_t prot, unsigned long pgoff,
                               int nonblock);
#endif
static int ll_populate(struct vm_area_struct *area, unsigned long address,
                       unsigned long len, pgprot_t prot, unsigned long pgoff,
                       int nonblock)
{
        int rc = 0;
        ENTRY;

        /* always set nonblock to true to avoid page read-ahead */
        rc = filemap_populate(area, address, len, prot, pgoff, 1);
        RETURN(rc);
}
#endif

/* return the user space pointer that maps to a file offset via a vma */
static inline unsigned long file_to_user(struct vm_area_struct *vma,
                                         __u64 byte)
{
        return vma->vm_start +
               (byte - ((__u64)vma->vm_pgoff << CFS_PAGE_SHIFT));
}
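/* Worked example for file_to_user() above, with hypothetical values and
 * 4 KiB pages (CFS_PAGE_SHIFT == 12): a vma with vm_start == 0x40000000
 * and vm_pgoff == 2 maps the file starting at offset 0x2000, so file
 * offset 0x3000 resolves to the user address
 * 0x40000000 + (0x3000 - 0x2000) == 0x40001000.  teardown_vmas() below
 * uses exactly this conversion to turn the [first, last] byte range into
 * a user-address range to zap. */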
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
/* [first, last] are the byte offsets affected.
 * vm_{start, end} are user addresses of the first byte of the mapping and
 *      the next byte beyond it
 * vm_pgoff is the page index of the first byte in the mapping */
static void teardown_vmas(struct vm_area_struct *vma, __u64 first,
                          __u64 last)
{
        unsigned long address, len;

        for (; vma; vma = vma->vm_next_share) {
                if (last >> CFS_PAGE_SHIFT < vma->vm_pgoff)
                        continue;
                if (first >> CFS_PAGE_SHIFT >= (vma->vm_pgoff +
                    ((vma->vm_end - vma->vm_start) >> CFS_PAGE_SHIFT)))
                        continue;

                /* XXX to avoid unmapping the COW pages of a running file,
                 * don't unmap these private writable mappings here, even
                 * though that breaks private mappings a little.
                 *
                 * the clean way is to check the mapping of every page and
                 * unmap only the non-COW pages, just like
                 * unmap_mapping_range() with even_cows == 0 in kernel 2.6. */
                if (!(vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
                        continue;

                address = max((unsigned long)vma->vm_start,
                              file_to_user(vma, first));
                len = min((unsigned long)vma->vm_end,
                          file_to_user(vma, last) + 1) - address;

                VMA_DEBUG(vma, "zapping vma [first="LPU64" last="LPU64" "
                          "address=%ld len=%ld]\n", first, last, address, len);
                LASSERT(len > 0);
                ll_zap_page_range(vma, address, len);
        }
}
#endif

/* XXX put nice comment here.  talk about __free_pte -> dirty pages and
 * nopage's reference passing to the pte */
int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last)
{
        int rc = -ENOENT;
        ENTRY;

        LASSERTF(last > first, "last "LPU64" first "LPU64"\n", last, first);
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
        if (mapping_mapped(mapping)) {
                rc = 0;
                unmap_mapping_range(mapping, first + CFS_PAGE_SIZE - 1,
                                    last - first + 1, 0);
        }
#else
        spin_lock(&mapping->i_shared_lock);
        if (mapping->i_mmap != NULL) {
                rc = 0;
                teardown_vmas(mapping->i_mmap, first, last);
        }
        if (mapping->i_mmap_shared != NULL) {
                rc = 0;
                teardown_vmas(mapping->i_mmap_shared, first, last);
        }
        spin_unlock(&mapping->i_shared_lock);
#endif

        RETURN(rc);
}

static struct vm_operations_struct ll_file_vm_ops = {
        .nopage         = ll_nopage,
        .open           = ll_vm_open,
        .close          = ll_vm_close,
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
        .populate       = ll_populate,
#endif
};

int ll_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        int rc;
        ENTRY;

        ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode), LPROC_LL_MAP, 1);
        rc = generic_file_mmap(file, vma);
        if (rc == 0) {
#if !defined(HAVE_FILEMAP_POPULATE) && \
    (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
                if (!filemap_populate)
                        filemap_populate = vma->vm_ops->populate;
#endif
                vma->vm_ops = &ll_file_vm_ops;
                vma->vm_ops->open(vma);

                /* update the inode's size and mtime */
                rc = ll_glimpse_size(file->f_dentry->d_inode, 0);
        }

        RETURN(rc);
}
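/* ll_file_mmap() only takes effect once it is registered as the ->mmap
 * method of the llite file_operations, which happens elsewhere in the
 * llite layer rather than in this file.  A minimal sketch of that wiring
 * (the neighbouring methods are listed here as an assumption, not copied
 * from this source tree):
 *
 *      struct file_operations ll_file_operations = {
 *              .read    = ll_file_read,
 *              .write   = ll_file_write,
 *              .open    = ll_file_open,
 *              .release = ll_file_release,
 *              .mmap    = ll_file_mmap,
 *      };
 *
 * With that in place, generic_file_mmap() performs the generic sanity
 * checks, ll_file_mmap() swaps vma->vm_ops for ll_file_vm_ops, and every
 * subsequent fault on the region goes through ll_nopage() under a DLM
 * extent lock. */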