rw.c
/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * Lustre Lite I/O page cache routines shared by different kernel revs
 *
 * Copyright (c) 2001-2003 Cluster File Systems, Inc.
 *
 * This file is part of Lustre, http://www.lustre.org.
 *
 * Lustre is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * Lustre is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Lustre; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#ifndef AUTOCONF_INCLUDED
#include <linux/config.h>
#endif
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/smp_lock.h>
#include <linux/unistd.h>
#include <linux/version.h>
#include <asm/system.h>
#include <asm/uaccess.h>

#include <linux/fs.h>
#include <linux/stat.h>
#include <asm/uaccess.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>

#define DEBUG_SUBSYSTEM S_LLITE

#include <lustre_lite.h>
#include "llite_internal.h"
#include <linux/lustre_compat25.h>

#ifndef list_for_each_prev_safe
#define list_for_each_prev_safe(pos, n, head) \
        for (pos = (head)->prev, n = pos->prev; pos != (head); \
             pos = n, n = pos->prev)
#endif

cfs_mem_cache_t *ll_async_page_slab = NULL;
size_t ll_async_page_slab_size = 0;

/* SYNCHRONOUS I/O to object storage for an inode */
static int ll_brw(int cmd, struct inode *inode, struct obdo *oa,
                  struct page *page, int flags)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lov_stripe_md *lsm = lli->lli_smd;
        struct obd_info oinfo = { { { 0 } } };
        struct brw_page pg;
        int rc;
        ENTRY;

        pg.pg = page;
        pg.off = ((obd_off)page->index) << CFS_PAGE_SHIFT;

        if ((cmd & OBD_BRW_WRITE) &&
            (pg.off + CFS_PAGE_SIZE > i_size_read(inode)))
                pg.count = i_size_read(inode) % CFS_PAGE_SIZE;
        else
                pg.count = CFS_PAGE_SIZE;

        LL_CDEBUG_PAGE(D_PAGE, page, "%s %d bytes ino %lu at "LPU64"/"LPX64"\n",
                       cmd & OBD_BRW_WRITE ? "write" : "read", pg.count,
                       inode->i_ino, pg.off, pg.off);
        if (pg.count == 0) {
                CERROR("ZERO COUNT: ino %lu: size %p:%Lu(%p:%Lu) idx %lu off "
                       LPU64"\n", inode->i_ino, inode, i_size_read(inode),
                       page->mapping->host, i_size_read(page->mapping->host),
                       page->index, pg.off);
        }

        pg.flag = flags;

        if (cmd & OBD_BRW_WRITE)
                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_WRITE,
                                   pg.count);
        else
                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_READ,
                                   pg.count);
        oinfo.oi_oa = oa;
        oinfo.oi_md = lsm;
        rc = obd_brw(cmd, ll_i2obdexp(inode), &oinfo, 1, &pg, NULL);
        if (rc == 0)
                obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS);
        else if (rc != -EIO)
                CERROR("error from obd_brw: rc = %d\n", rc);
        RETURN(rc);
}

int ll_file_punch(struct inode *inode, loff_t new_size, int srvlock)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct obd_info oinfo = { { { 0 } } };
        struct obdo oa;
        int rc;
        ENTRY;

        CDEBUG(D_INFO, "calling punch for "LPX64" (new size %Lu=%#Lx)\n",
               lli->lli_smd->lsm_object_id, new_size, new_size);

        oinfo.oi_md = lli->lli_smd;
        oinfo.oi_policy.l_extent.start = new_size;
        oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
        oinfo.oi_oa = &oa;
        oa.o_id = lli->lli_smd->lsm_object_id;
        oa.o_valid = OBD_MD_FLID;
        oa.o_flags = srvlock ?
                OBD_FL_TRUNCLOCK : 0;
        obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |
                        OBD_MD_FLFID | OBD_MD_FLATIME | OBD_MD_FLMTIME |
                        OBD_MD_FLCTIME | OBD_MD_FLUID | OBD_MD_FLGID |
                        OBD_MD_FLGENER | OBD_MD_FLBLOCKS);

        rc = obd_punch_rqset(ll_i2obdexp(inode), &oinfo, NULL);
        if (rc) {
                CERROR("obd_truncate fails (%d) ino %lu\n", rc, inode->i_ino);
                RETURN(rc);
        }
        obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
                      OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME);
        RETURN(0);
}

/* this isn't where truncate starts.  roughly:
 * sys_truncate->ll_setattr_raw->vmtruncate->ll_truncate.  setattr_raw grabs
 * DLM lock on [size, EOF], i_mutex, ->lli_size_sem, and WRITE_I_ALLOC_SEM to
 * avoid races.
 *
 * must be called under ->lli_size_sem */
void ll_truncate(struct inode *inode)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        int srvlock = test_bit(LLI_F_SRVLOCK, &lli->lli_flags);
        loff_t new_size;
        ENTRY;
        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) to %Lu=%#Lx\n",
               inode->i_ino, inode->i_generation, inode, i_size_read(inode),
               i_size_read(inode));
        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_TRUNC, 1);
        if (lli->lli_size_sem_owner != current) {
                EXIT;
                return;
        }

        if (!lli->lli_smd) {
                CDEBUG(D_INODE, "truncate on inode %lu with no objects\n",
                       inode->i_ino);
                GOTO(out_unlock, 0);
        }

        LASSERT(atomic_read(&lli->lli_size_sem.count) <= 0);

        if (!srvlock) {
                struct ost_lvb lvb;
                int rc;

                /* XXX I'm pretty sure this is a hack to paper over a more
                 * fundamental race condition. */
                lov_stripe_lock(lli->lli_smd);
                inode_init_lvb(inode, &lvb);
                rc = obd_merge_lvb(ll_i2obdexp(inode), lli->lli_smd, &lvb, 0);
                inode->i_blocks = lvb.lvb_blocks;
                if (lvb.lvb_size == i_size_read(inode) && rc == 0) {
                        CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64
                               ", %Lu=%#Lx\n", lli->lli_smd->lsm_object_id,
                               i_size_read(inode), i_size_read(inode));
                        lov_stripe_unlock(lli->lli_smd);
                        GOTO(out_unlock, 0);
                }

                obd_adjust_kms(ll_i2obdexp(inode), lli->lli_smd,
                               i_size_read(inode), 1);
                lov_stripe_unlock(lli->lli_smd);
        }

        if (unlikely((ll_i2sbi(inode)->ll_flags & LL_SBI_LLITE_CHECKSUM) &&
                     (i_size_read(inode) & ~CFS_PAGE_MASK))) {
                /* If the truncate leaves a partial page, update its checksum */
                struct page *page = find_get_page(inode->i_mapping,
                                                  i_size_read(inode) >>
                                                  CFS_PAGE_SHIFT);
                if (page != NULL) {
                        struct ll_async_page *llap = llap_cast_private(page);
                        if (llap != NULL) {
                                char *kaddr = kmap_atomic(page, KM_USER0);
                                llap->llap_checksum =
                                        init_checksum(OSC_DEFAULT_CKSUM);
                                llap->llap_checksum =
                                        compute_checksum(llap->llap_checksum,
                                                         kaddr, CFS_PAGE_SIZE,
                                                         OSC_DEFAULT_CKSUM);
                                kunmap_atomic(kaddr, KM_USER0);
                        }
                        page_cache_release(page);
                }
        }

        new_size = i_size_read(inode);
        ll_inode_size_unlock(inode, 0);
        if (!srvlock)
                ll_file_punch(inode, new_size, 0);

        EXIT;
        return;

 out_unlock:
        ll_inode_size_unlock(inode, 0);
} /* ll_truncate */

int ll_prepare_write(struct file *file, struct page *page, unsigned from,
                     unsigned to)
{
        struct inode *inode = page->mapping->host;
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lov_stripe_md *lsm = lli->lli_smd;
        obd_off offset = ((obd_off)page->index) << CFS_PAGE_SHIFT;
        struct obd_info oinfo = { { { 0 } } };
        struct brw_page pga;
        struct obdo oa;
        struct ost_lvb lvb;
        int rc = 0;
        ENTRY;

        LASSERT(PageLocked(page));
        (void)llap_cast_private(page); /* assertion */

        /* Check to see if we should return -EIO right away */
        pga.pg = page;
        pga.off = offset;
        pga.count = CFS_PAGE_SIZE;
        pga.flag = 0;

        oa.o_mode = inode->i_mode;
        oa.o_id = lsm->lsm_object_id;
        oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE;
        obdo_from_inode(&oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER);
        oinfo.oi_oa = &oa;
        oinfo.oi_md = lsm;
        rc = obd_brw(OBD_BRW_CHECK, ll_i2obdexp(inode), &oinfo, 1, &pga, NULL);
        if (rc)
                RETURN(rc);

        if (PageUptodate(page)) {
                LL_CDEBUG_PAGE(D_PAGE, page, "uptodate\n");
                RETURN(0);
        }

        /* We're completely overwriting an existing page, so _don't_ set it up
         * to date until commit_write */
        if (from == 0 && to == CFS_PAGE_SIZE) {
                LL_CDEBUG_PAGE(D_PAGE, page, "full page write\n");
                POISON_PAGE(page, 0x11);
                RETURN(0);
        }

        /* If we are writing to a new page, no need to read old data.  The
         * extent locking will have updated the KMS, and for our purposes
         * here we can treat it like i_size. */
        lov_stripe_lock(lsm);
        inode_init_lvb(inode, &lvb);
        obd_merge_lvb(ll_i2obdexp(inode), lsm, &lvb, 1);
        lov_stripe_unlock(lsm);
        if (lvb.lvb_size <= offset) {
                char *kaddr = kmap_atomic(page, KM_USER0);
                LL_CDEBUG_PAGE(D_PAGE, page, "kms "LPU64" <= offset "LPU64"\n",
                               lvb.lvb_size, offset);
                memset(kaddr, 0, CFS_PAGE_SIZE);
                kunmap_atomic(kaddr, KM_USER0);
                GOTO(prepare_done, rc = 0);
        }

        /* XXX could be an async ocp read.. read-ahead? */
        rc = ll_brw(OBD_BRW_READ, inode, &oa, page, 0);
        if (rc == 0) {
                /* bug 1598: don't clobber blksize */
                oa.o_valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLKSZ);
                obdo_refresh_inode(inode, &oa, oa.o_valid);
        }

        EXIT;
 prepare_done:
        if (rc == 0)
                SetPageUptodate(page);

        return rc;
}

static int ll_ap_make_ready(void *data, int cmd)
{
        struct ll_async_page *llap;
        struct page *page;
        ENTRY;

        llap = LLAP_FROM_COOKIE(data);
        page = llap->llap_page;

        LASSERTF(!(cmd & OBD_BRW_READ), "cmd %x page %p ino %lu index %lu\n",
                 cmd, page, page->mapping->host->i_ino, page->index);

        /* we're trying to write, but the page is locked.. come back later */
        if (TryLockPage(page))
                RETURN(-EAGAIN);

        LASSERT(!PageWriteback(page));

        /* if we left PageDirty we might get another writepage call
         * in the future.  list walkers are bright enough
         * to check page dirty so we can leave it on whatever list
         * its on.  XXX also, we're called with the cli list so if
         * we got the page cache list we'd create a lock inversion
         * with the removepage path which gets the page lock then the
         * cli lock */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
        clear_page_dirty(page);
#else
        LASSERTF(!PageWriteback(page), "cmd %x page %p ino %lu index %lu\n",
                 cmd, page, page->mapping->host->i_ino, page->index);
        clear_page_dirty_for_io(page);

        /* This actually clears the dirty bit in the radix tree. */
        set_page_writeback(page);
#endif

        LL_CDEBUG_PAGE(D_PAGE, page, "made ready\n");
        page_cache_get(page);

        RETURN(0);
}

/* We have two reasons for giving llite the opportunity to change the
 * write length of a given queued page as it builds the RPC containing
 * the page:
 *
 * 1) Further extending writes may have landed in the page cache
 *    since a partial write first queued this page requiring us
 *    to write more from the page cache.  (No further races are possible,
 *    since by the time this is called, the page is locked.)
 * 2) We might have raced with truncate and want to avoid performing
 *    write RPCs that are just going to be thrown away by the