raid5-zerocopy.patch
diff -pru linux-2.6.9.orig/drivers/md/raid5.c linux-2.6.9/drivers/md/raid5.c
--- linux-2.6.9.orig/drivers/md/raid5.c	2007-07-09 02:43:33.000000000 -0600
+++ linux-2.6.9/drivers/md/raid5.c	2007-07-13 00:39:15.000000000 -0600
@@ -412,6 +412,7 @@ static int raid5_end_read_request (struc
 		clear_buffer_uptodate(bh);
 	}
 #endif
+	BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
@@ -450,6 +451,10 @@ static int raid5_end_write_request (stru
 
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
 
+	if (test_bit(R5_Direct, &sh->dev[i].flags)) {
+		BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
+		sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
+	}
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	__release_stripe(conf, sh);
@@ -621,6 +626,25 @@ static sector_t compute_blocknr(struct s
 }
 
 
+static struct page *zero_copy_data(struct bio *bio, sector_t sector)
+{
+	sector_t bi_sector = bio->bi_sector;
+	struct page *page;
+	struct bio_vec *bvl;
+	int i;
+
+	bio_for_each_segment(bvl, bio, i) {
+		if (sector > bi_sector) {
+			bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
+			continue;
+		}
+		BUG_ON(sector != bi_sector);
+		page = bio_iovec_idx(bio, i)->bv_page;
+		return PageConstant(page) ? page : NULL;
+	}
+	BUG();
+	return NULL;
+}
 
 /*
  * Copy data between a page in the stripe cache, and one or more bion
@@ -716,8 +740,9 @@ static void compute_parity(struct stripe
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
-	void *ptr[MAX_XOR_BLOCKS];
+	void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
 	struct bio *chosen;
+	struct page *page;
 
 	PRINTK("compute_parity, stripe %llu, method %d\n",
 		(unsigned long long)sh->sector, method);
@@ -744,13 +769,14 @@ static void compute_parity(struct stripe
 		break;
 	case RECONSTRUCT_WRITE:
 		memset(ptr[0], 0, STRIPE_SIZE);
-		for (i= disks; i-- ;)
+		for (i= disks; i-- ;) {
 			if (i!=pd_idx && sh->dev[i].towrite) {
 				chosen = sh->dev[i].towrite;
 				sh->dev[i].towrite = NULL;
 				if (sh->dev[i].written) BUG();
 				sh->dev[i].written = chosen;
 			}
+		}
 		break;
 	case CHECK_PARITY:
 		break;
@@ -760,34 +786,88 @@ static void compute_parity(struct stripe
 		count = 1;
 	}
 
-	for (i = disks; i--;)
-		if (sh->dev[i].written) {
-			sector_t sector = sh->dev[i].sector;
-			struct bio *wbi = sh->dev[i].written;
-			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-				copy_data(1, wbi, sh->dev[i].page, sector);
-				wbi = r5_next_bio(wbi, sector);
+	for (i = disks; i--;) {
+		struct bio *wbi = sh->dev[i].written;
+		sector_t sector;
+
+		if (!wbi)
+			continue;
+
+		sector = sh->dev[i].sector;
+		set_bit(R5_LOCKED, &sh->dev[i].flags);
+		BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
+
+		/* check if it's covered by a single page
+		 * and whole stripe is written at once.
+		 * in this case we can avoid memcpy() */
+		if (!wbi->bi_next && test_bit(R5_OVERWRITE, &sh->dev[i].flags) &&
+		    test_bit(R5_Insync, &sh->dev[i].flags)) {
+			page = zero_copy_data(wbi, sector);
+			if (page) {
+				atomic_inc(&conf->writes_zcopy);
+				sh->dev[i].req.bi_io_vec[0].bv_page = page;
+				set_bit(R5_Direct, &sh->dev[i].flags);
+				clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+				clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
+				continue;
 			}
+		}
 
-			set_bit(R5_LOCKED, &sh->dev[i].flags);
-			set_bit(R5_UPTODATE, &sh->dev[i].flags);
+		atomic_inc(&conf->writes_copied);
+		test_and_clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
+		set_bit(R5_UPTODATE, &sh->dev[i].flags);
+		while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+			copy_data(1, wbi, sh->dev[i].page, sector);
+			wbi = r5_next_bio(wbi, sector);
 		}
+	}
 
+	h_ptr[0] = ptr[0];
 	switch(method) {
 	case RECONSTRUCT_WRITE:
 	case CHECK_PARITY:
-		for (i=disks; i--;)
-			if (i != pd_idx) {
-				ptr[count++] = page_address(sh->dev[i].page);
-				check_xor();
+		for (i=disks; i--;) {
+			if (i == pd_idx)
+				continue;
+			if (test_bit(R5_Direct, &sh->dev[i].flags))
+				page = sh->dev[i].req.bi_io_vec[0].bv_page;
+			else
+				page = sh->dev[i].page;
+
+			/* have to compute the parity immediately for
+			 * a highmem page. it would happen for zerocopy. -jay
+			 */
+			if (PageHighMem(page)) {
+				h_ptr[1] = kmap_atomic(page, KM_USER0);
+				xor_block(2, STRIPE_SIZE, h_ptr);
+				kunmap_atomic(page, KM_USER0);
+			} else {
+				ptr[count++] = page_address(page);
 			}
+			check_xor();
+		}
 		break;
 	case READ_MODIFY_WRITE:
-		for (i = disks; i--;)
-			if (sh->dev[i].written) {
-				ptr[count++] = page_address(sh->dev[i].page);
-				check_xor();
+		for (i = disks; i--;) {
+			if (!sh->dev[i].written)
+				continue;
+			if (test_bit(R5_Direct, &sh->dev[i].flags))
+				page = sh->dev[i].req.bi_io_vec[0].bv_page;
+			else
+				page = sh->dev[i].page;
+
+			/* have to compute the parity immediately for
+			 * a highmem page. it would happen for zerocopy. -jay
+			 */
+			if (PageHighMem(page)) {
+				h_ptr[1] = kmap_atomic(page, KM_USER0);
+				xor_block(2, STRIPE_SIZE, h_ptr);
+				kunmap_atomic(page, KM_USER0);
+			} else {
+				ptr[count++] = page_address(page);
 			}
+			check_xor();
+		}
 	}
 	if (count != 1)
 		xor_block(count, STRIPE_SIZE, ptr);
@@ -1059,13 +1139,15 @@ static void handle_stripe(struct stripe_
 		if (sh->dev[i].written) {
 			dev = &sh->dev[i];
 			if (!test_bit(R5_LOCKED, &dev->flags) &&
-			    test_bit(R5_UPTODATE, &dev->flags) ) {
+			    (test_bit(R5_UPTODATE, &dev->flags) ||
+			     test_bit(R5_Direct, &dev->flags)) ) {
 				/* We can return any write requests */
 				struct bio *wbi, *wbi2;
 				PRINTK("Return write for disc %d\n", i);
 				spin_lock_irq(&conf->device_lock);
 				wbi = dev->written;
 				dev->written = NULL;
+				test_and_clear_bit(R5_Direct, &dev->flags);
 				while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector);
 					if (--wbi->bi_phys_segments == 0) {
@@ -1831,6 +1913,7 @@ memory = conf->max_nr_stripes * (sizeof(
 		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
 	}
+	mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONST_WRITE;
 
 	/* Ok, everything is just fine now */
 	mddev->array_size = mddev->size * (mddev->raid_disks - 1);
@@ -1918,9 +2001,11 @@ static void status (struct seq_file *seq
 			atomic_read(&conf->handled_in_raid5d),
 			atomic_read(&conf->out_of_stripes),
 			atomic_read(&conf->handle_called));
-	seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
+	seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
 			atomic_read(&conf->reads_for_rmw),
-			atomic_read(&conf->reads_for_rcw));
+			atomic_read(&conf->reads_for_rcw),
+			atomic_read(&conf->writes_zcopy),
+			atomic_read(&conf->writes_copied));
 	seq_printf (seq, "\n\t\t%u delayed, %u active, queues: %u in, %u out\n",
 			atomic_read(&conf->delayed),
 			atomic_read(&conf->active_stripes),
diff -pru linux-2.6.9.orig/include/linux/backing-dev.h linux-2.6.9/include/linux/backing-dev.h
--- linux-2.6.9.orig/include/linux/backing-dev.h	2004-10-18 15:53:46.000000000 -0600
+++ linux-2.6.9/include/linux/backing-dev.h	2007-07-13 00:12:46.000000000 -0600
@@ -30,8 +30,11 @@ struct backing_dev_info {
 	void *congested_data;	/* Pointer to aux data for congested func */
 	void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
 	void *unplug_io_data;
+	unsigned int capabilities;
 };
 
+#define BDI_CAP_PAGE_CONST_WRITE	0x00000001
+
 extern struct backing_dev_info default_backing_dev_info;
 void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page);
 
@@ -62,4 +65,7 @@ static inline int bdi_rw_congested(struc
 				  (1 << BDI_write_congested));
 }
 
+#define mapping_cap_page_constant_write(mapping) \
+	((mapping)->backing_dev_info->capabilities & BDI_CAP_PAGE_CONST_WRITE)
+
 #endif	/* _LINUX_BACKING_DEV_H */
diff -pru linux-2.6.9.orig/include/linux/page-flags.h linux-2.6.9/include/linux/page-flags.h
--- linux-2.6.9.orig/include/linux/page-flags.h	2004-10-18 15:54:39.000000000 -0600
+++ linux-2.6.9/include/linux/page-flags.h	2007-07-13 00:12:46.000000000 -0600
@@ -74,6 +74,7 @@
 #define PG_swapcache		16	/* Swap page: swp_entry_t in private */
 #define PG_mappedtodisk		17	/* Has blocks allocated on-disk */
 #define PG_reclaim		18	/* To be reclaimed asap */
+#define PG_constant		19	/* To mark the page is constant */
 
 
 /*
@@ -298,6 +299,11 @@ extern unsigned long __read_page_state(u
 #define PageSwapCache(page)	0
 #endif
 
+#define PageConstant(page)	test_bit(PG_constant, &(page)->flags)
+#define SetPageConstant(page)	set_bit(PG_constant, &(page)->flags)
+#define ClearPageConstant(page)	clear_bit(PG_constant, &(page->flags))
+#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
+
 struct page;	/* forward declaration */
 
 int test_clear_page_dirty(struct page *page);
diff -pru linux-2.6.9.orig/include/linux/pagemap.h linux-2.6.9/include/linux/pagemap.h
--- linux-2.6.9.orig/include/linux/pagemap.h	2004-10-18 15:53:06.000000000 -0600
+++ linux-2.6.9/include/linux/pagemap.h	2007-07-13 00:12:46.000000000 -0600
@@ -191,6 +191,19 @@ static inline void wait_on_page_writebac
 
 extern void end_page_writeback(struct page *page);
 
+extern int set_page_constant(struct page *page);
+extern void clear_page_constant(struct page *);
+static inline int set_page_constant_lock(struct page *page)
+{
+	BUG_ON(PageLocked(page));
+	lock_page(page);
+	if (set_page_constant(page)) {
+		unlock_page(page);
+		return 1;
+	}
+	return 0;
+}
+
 /*
  * Fault a userspace page into pagetables.  Return non-zero on a fault.
 *
diff -pru linux-2.6.9.orig/include/linux/raid/raid5.h linux-2.6.9/include/linux/raid/raid5.h
--- linux-2.6.9.orig/include/linux/raid/raid5.h	2007-07-09 02:43:33.000000000 -0600
+++ linux-2.6.9/include/linux/raid/raid5.h	2007-07-13 00:39:15.000000000 -0600
@@ -153,6 +153,7 @@ struct stripe_head {
 #define	R5_Wantread	4	/* want to schedule a read */
 #define	R5_Wantwrite	5
 #define	R5_Syncio	6	/* this io need to be accounted as resync io */
+#define	R5_Direct	7	/* use page from passed bio to avoid memcpy */
 
 /*
  * Write method
@@ -234,6 +235,8 @@ struct raid5_private_data {
 	atomic_t		out_of_stripes;
 	atomic_t		reads_for_rmw;
 	atomic_t		reads_for_rcw;
+	atomic_t		writes_zcopy;
+	atomic_t		writes_copied;
 	atomic_t		handle_called;
 	atomic_t		delayed;
 	atomic_t		in_reqs_in_queue;
diff -pru linux-2.6.9.orig/mm/filemap.c linux-2.6.9/mm/filemap.c
--- linux-2.6.9.orig/mm/filemap.c	2007-07-09 02:43:33.000000000 -0600
+++ linux-2.6.9/mm/filemap.c	2007-07-13 00:12:46.000000000 -0600
@@ -27,6 +27,8 @@
 #include <linux/pagevec.h>
 #include <linux/blkdev.h>
 #include <linux/security.h>
+#include <linux/rmap.h>
+
 /*
  * This is needed for the following functions:
  *	- try_to_release_page
@@ -486,11 +488,52 @@ void end_page_writeback(struct page *pag
 			BUG();
 		smp_mb__after_clear_bit();
 	}
+	clear_page_constant(page);
 	wake_up_page(page);
 }
 EXPORT_SYMBOL(end_page_writeback);
 
+/* Mark a page in bio to be constant, page must be locked */
+int set_page_constant(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	/* If it's an anonymous page and haven't been added to swap cache,
+	 * do it here.
+	 */
+	if (PageAnon(page) && !PageSwapCache(page))
+		return 1;
+
+	BUG_ON(!PageUptodate(page));
+
+	/* I have to clear page uptodate before trying to remove
+	 * it from user's page table because otherwise, the page may be
+	 * reinstalled by a page access which happens between try_to_unmap()
+	 * and ClearPageUptodate(). -jay
+	 */
+	ClearPageUptodate(page);
+	if (page_mapped(page) && try_to_unmap(page) != SWAP_SUCCESS) {
+		SetPageUptodate(page);
+		return 1;
+	}
+	SetPageConstant(page);
+	return 0;
+}
+
+void clear_page_constant(struct page *page)
+{
+	if (PageConstant(page)) {
+		BUG_ON(!PageLocked(page));
+		BUG_ON(PageUptodate(page));
+		ClearPageConstant(page);
+		SetPageUptodate(page);
+		unlock_page(page);
+	}
+}
+EXPORT_SYMBOL(set_page_constant);
+EXPORT_SYMBOL(clear_page_constant);
+
 /*
  * Get a lock on the page, assuming we need to sleep to get it.
 *
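Usage note (not part of the patch): the patch only exports the page-constant API and teaches raid5 to honour it; the code that actually submits constant pages lives in the caller (e.g. a filesystem client). Below is a minimal sketch, under stated assumptions, of how such a caller could drive it on a 2.6.9 kernel with this patch applied. The function submit_write() and its bio/writeback setup are hypothetical; only mapping_cap_page_constant_write(), set_page_constant_lock(), BDI_CAP_PAGE_CONST_WRITE and the clear_page_constant() call inside end_page_writeback() come from the patch above, and it is assumed the caller has already marked the page PageWriteback and that its bio completion path ends in end_page_writeback(), as ordinary buffered writeback does.

/*
 * Hypothetical caller of the page-constant API introduced by this patch.
 * 'bio' is assumed to already point at 'page' and to target a raid5 device.
 */
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>

static void submit_write(struct address_space *mapping,
			 struct page *page, struct bio *bio)
{
	/*
	 * Only BDIs advertising BDI_CAP_PAGE_CONST_WRITE (raid5 sets it in
	 * run()) understand PG_constant pages.  set_page_constant_lock()
	 * returns 0 on success and leaves the page locked, !Uptodate and
	 * PG_constant, so compute_parity() can XOR it straight from the bio
	 * (the R5_Direct path) instead of copying it into the stripe cache.
	 * On failure it unlocks the page again and the write simply goes
	 * through the normal copying path.
	 */
	if (mapping_cap_page_constant_write(mapping))
		set_page_constant_lock(page);

	submit_bio(WRITE, bio);

	/*
	 * For a constant page, end_page_writeback() on the completion side
	 * calls clear_page_constant(), which restores PG_uptodate and
	 * unlocks the page; until then the page contents must not change.
	 */
}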