📄 ext3-mballoc3-core.patch
字号:
Index: linux-2.6.9-full/include/linux/ext3_fs.h===================================================================--- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2007-06-08 23:44:08.000000000 +0400+++ linux-2.6.9-full/include/linux/ext3_fs.h 2007-10-17 22:25:01.000000000 +0400@@ -57,6 +57,30 @@ struct statfs; #define ext3_debug(f, a...) do {} while (0) #endif +#define EXT3_MULTIBLOCK_ALLOCATOR 1++#define EXT3_MB_HINT_MERGE 1 /* prefer goal again. length */+#define EXT3_MB_HINT_RESERVED 2 /* blocks already reserved */+#define EXT3_MB_HINT_METADATA 4 /* metadata is being allocated */+#define EXT3_MB_HINT_FIRST 8 /* first blocks in the file */+#define EXT3_MB_HINT_BEST 16 /* search for the best chunk */+#define EXT3_MB_HINT_DATA 32 /* data is being allocated */+#define EXT3_MB_HINT_NOPREALLOC 64 /* don't preallocate (for tails) */+#define EXT3_MB_HINT_GROUP_ALLOC 128 /* allocate for locality group */+#define EXT3_MB_HINT_GOAL_ONLY 256 /* allocate goal blocks or none */++struct ext3_allocation_request {+ struct inode *inode; /* target inode for block we're allocating */+ unsigned long logical; /* logical block in target inode */+ unsigned long goal; /* phys. target (a hint) */+ unsigned long lleft; /* the closest logical allocated block to the left */+ unsigned long pleft; /* phys. block for ^^^ */+ unsigned long lright; /* the closest logical allocated block to the right */+ unsigned long pright; /* phys. block for ^^^ */+ unsigned long len; /* how many blocks we want to allocate */+ unsigned long flags; /* flags. 
see above EXT3_MB_HINT_* */+};+ /* * Special inodes numbers */@@ -387,6 +411,14 @@ struct ext3_inode { #define ext3_find_first_zero_bit ext2_find_first_zero_bit #define ext3_find_next_zero_bit ext2_find_next_zero_bit +#ifndef ext2_find_next_le_bit+#ifdef __LITTLE_ENDIAN+#define ext2_find_next_le_bit(addr, size, off) find_next_bit((addr), (size), (off))+#else+#error "mballoc needs a patch for big-endian systems - CFS bug 10634"+#endif /* __LITTLE_ENDIAN */+#endif /* !ext2_find_next_le_bit */+ /* * Maximal mount counts between two filesystem checks */@@ -763,6 +795,20 @@ extern unsigned long ext3_count_dirs (st extern void ext3_check_inodes_bitmap (struct super_block *); extern unsigned long ext3_count_free (struct buffer_head *, unsigned); +/* mballoc.c */+extern long ext3_mb_stats;+extern long ext3_mb_max_to_scan;+extern int ext3_mb_init(struct super_block *, int);+extern int ext3_mb_release(struct super_block *);+extern unsigned long ext3_mb_new_blocks(handle_t *, struct ext3_allocation_request *, int *);+extern int ext3_mb_reserve_blocks(struct super_block *, int);+extern void ext3_mb_release_blocks(struct super_block *, int);+extern void ext3_mb_release_blocks(struct super_block *, int);+extern void ext3_mb_discard_inode_preallocations(struct inode *);+extern int __init init_ext3_proc(void);+extern void exit_ext3_proc(void);+extern void ext3_mb_free_blocks(handle_t *, struct inode *, unsigned long, unsigned long, int, int *);+ /* inode.c */ extern int ext3_block_truncate_page(handle_t *, struct page *,Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h===================================================================--- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2007-06-08 23:44:07.000000000 +0400+++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2007-10-17 22:25:01.000000000 +0400@@ -81,6 +81,61 @@ struct ext3_sb_info { char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ #endif++ /* for 
buddy allocator */+ struct ext3_group_info ***s_group_info;+ struct inode *s_buddy_cache;+ long s_blocks_reserved;+ spinlock_t s_reserve_lock;+ struct list_head s_active_transaction;+ struct list_head s_closed_transaction;+ struct list_head s_committed_transaction;+ spinlock_t s_md_lock;+ tid_t s_last_transaction;+ unsigned short *s_mb_offsets, *s_mb_maxs;++ /* tunables */+ unsigned long s_mb_factor;+ unsigned long s_stripe;+ unsigned long s_mb_stream_request;+ unsigned long s_mb_max_to_scan;+ unsigned long s_mb_min_to_scan;+ unsigned long s_mb_max_groups_to_scan;+ unsigned long s_mb_stats;+ unsigned long s_mb_order2_reqs;++ /* history to debug policy */+ struct ext3_mb_history *s_mb_history;+ int s_mb_history_cur;+ int s_mb_history_max;+ int s_mb_history_num;+ struct proc_dir_entry *s_mb_proc;+ spinlock_t s_mb_history_lock;+ int s_mb_history_filter;++ /* stats for buddy allocator */+ spinlock_t s_mb_pa_lock;+ atomic_t s_bal_reqs; /* number of reqs with len > 1 */+ atomic_t s_bal_success; /* we found long enough chunks */+ atomic_t s_bal_allocated; /* in blocks */+ atomic_t s_bal_ex_scanned; /* total extents scanned */+ atomic_t s_bal_goals; /* goal hits */+ atomic_t s_bal_breaks; /* too long searches */+ atomic_t s_bal_2orders; /* 2^order hits */+ spinlock_t s_bal_lock;+ unsigned long s_mb_buddies_generated;+ unsigned long long s_mb_generation_time;+ atomic_t s_mb_lost_chunks;+ atomic_t s_mb_preallocated;+ atomic_t s_mb_discarded;++ /* locality groups */+ struct ext3_locality_group *s_locality_groups;+ }; +#define EXT3_GROUP_INFO(sb, group) \+ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \+ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)]+ #endif /* _LINUX_EXT3_FS_SB */Index: linux-2.6.9-full/fs/ext3/super.c===================================================================--- linux-2.6.9-full.orig/fs/ext3/super.c 2007-06-08 23:44:08.000000000 +0400+++ linux-2.6.9-full/fs/ext3/super.c 2007-10-17 22:26:27.000000000 +0400@@ -394,6 +394,7 @@ void 
ext3_put_super (struct super_block struct ext3_super_block *es = sbi->s_es; int i; + ext3_mb_release(sb); ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal);@@ -463,6 +464,8 @@ static struct inode *ext3_alloc_inode(st ei->vfs_inode.i_version = 1; memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent));+ INIT_LIST_HEAD(&ei->i_prealloc_list);+ spin_lock_init(&ei->i_prealloc_lock); return &ei->vfs_inode; } @@ -2576,7 +2579,13 @@ static struct file_system_type ext3_fs_t static int __init init_ext3_fs(void) {- int err = init_ext3_xattr();+ int err;++ err = init_ext3_proc();+ if (err)+ return err;++ err = init_ext3_xattr(); if (err) return err; err = init_inodecache();@@ -2598,6 +2607,7 @@ static void __exit exit_ext3_fs(void) unregister_filesystem(&ext3_fs_type); destroy_inodecache(); exit_ext3_xattr();+ exit_ext3_proc(); } int ext3_prep_san_write(struct inode *inode, long *blocks,Index: linux-2.6.9-full/fs/ext3/mballoc.c===================================================================--- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2007-10-17 21:59:51.072534980 +0400+++ linux-2.6.9-full/fs/ext3/mballoc.c 2007-10-17 23:09:22.000000000 +0400@@ -0,0 +1,4404 @@+/*+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com+ * Written by Alex Tomas <alex@clusterfs.com>+ *+ * This program is free software; you can redistribute it and/or modify+ * it under the terms of the GNU General Public License version 2 as+ * published by the Free Software Foundation.+ *+ * This program is distributed in the hope that it will be useful,+ * but WITHOUT ANY WARRANTY; without even the implied warranty of+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the+ * GNU General Public License for more details.+ *+ * You should have received a copy of the GNU General Public License+ * along with this program; if not, write to the Free Software+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA+ */+++/*+ * mballoc.c contains the multiblocks allocation routines+ */++#include <linux/time.h>+#include <linux/fs.h>+#include <linux/namei.h>+#include <linux/ext3_jbd.h>+#include <linux/jbd.h>+#include <linux/ext3_fs.h>+#include <linux/quotaops.h>+#include <linux/buffer_head.h>+#include <linux/module.h>+#include <linux/swap.h>+#include <linux/proc_fs.h>+#include <linux/pagemap.h>+#include <linux/seq_file.h>+#include <linux/version.h>++/*+ * MUSTDO:+ * - test ext3_ext_search_left() and ext3_ext_search_right()+ * - search for metadata in few groups+ *+ * TODO v4:+ * - normalization should take into account whether file is still open+ * - discard preallocations if no free space left (policy?)+ * - don't normalize tails+ * - quota+ * - reservation for superuser+ *+ * TODO v3:+ * - bitmap read-ahead (proposed by Oleg Drokin aka green)+ * - track min/max extents in each group for better group selection+ * - mb_mark_used() may allocate chunk right after splitting buddy+ * - tree of groups sorted by number of free blocks+ * - error handling+ */++/*+ * mballoc operates on the following data:+ * - on-disk bitmap+ * - in-core buddy (actually includes buddy and bitmap)+ * - preallocation descriptors (PAs)+ *+ * there are two types of preallocations:+ * - inode+ * assigned to specific inode and can be used for this inode only.+ * it describes part of inode's space preallocated to specific+ * physical blocks. any block from that preallocation can be used+ * independently. the descriptor just tracks number of blocks left+ * unused. so, before taking some block from descriptor, one must+ * make sure corresponding logical block isn't allocated yet. 
this+ * also means that freeing any block within descriptor's range+ * must discard all preallocated blocks.+ * - locality group+ * assigned to specific locality group which does not translate to+ * permanent set of inodes: inode can join and leave group. space+ * from this type of preallocation can be used for any inode. thus+ * it's consumed from the beginning to the end.+ *+ * relation between them can be expressed as:+ * in-core buddy = on-disk bitmap + preallocation descriptors+ *+ * this means blocks mballoc considers used are:+ * - allocated blocks (persistent)+ * - preallocated blocks (non-persistent)+ *+ * consistency in mballoc world means that at any time a block is either+ * free or used in ALL structures. notice: "any time" should not be read+ * literally -- time is discrete and delimited by locks.+ *+ * to keep it simple, we don't use block numbers, instead we count number of+ * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.+ *+ * all operations can be expressed as:+ * - init buddy: buddy = on-disk + PAs+ * - new PA: buddy += N; PA = N+ * - use inode PA: on-disk += N; PA -= N+ * - discard inode PA buddy -= on-disk - PA; PA = 0+ * - use locality group PA on-disk += N; PA -= N+ * - discard locality group PA buddy -= PA; PA = 0+ * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap+ * is used in real operation because we can't know actual used+ * bits from PA, only from on-disk bitmap+ *+ * if we follow this strict logic, then all operations above should be atomic.+ * given some of them can block, we'd have to use something like semaphores+ * killing performance on high-end SMP hardware. let's try to relax it using+ * the following knowledge:+ * 1) if buddy is referenced, it's already initialized+ * 2) while block is used in buddy and the buddy is referenced,+ * nobody can re-allocate that block+ * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has+ * bit set and PA claims same block, it's OK. 
IOW, one can set bit in+ * on-disk bitmap if buddy has same bit set or/and PA covers corresponding+ * block+ *+ * so, now we're building a concurrency table:+ * - init buddy vs.+ * - new PA+ * blocks for PA are allocated in the buddy, buddy must be referenced+ * until PA is linked to allocation group to avoid concurrent buddy init+ * - use inode PA+ * we need to make sure that either on-disk bitmap or PA has up-to-date data+ * given (3) we care that PA-=N operation doesn't interfere with init+ * - discard inode PA+ * the simplest way would be to have buddy initialized by the discard+ * - use locality group PA+ * again PA-=N must be serialized with init+ * - discard locality group PA+ * the simplest way would be to have buddy initialized by the discard+ * - new PA vs.+ * - use inode PA+ * i_truncate_mutex serializes them+ * - discard inode PA+ * discard process must wait until PA isn't used by another process+ * - use locality group PA+ * some mutex should serialize them+ * - discard locality group PA+ * discard process must wait until PA isn't used by another process+ * - use inode PA vs.+ * - use inode PA+ * i_truncate_mutex or another mutex should serialize them+ * - discard inode PA+ * discard process must wait until PA isn't used by another process+ * - use locality group PA+ * nothing wrong here -- they're different PAs covering different blocks+ * - discard locality group PA+ * discard process must wait until PA isn't used by another process+ *+ * now we're ready to draw a few conclusions:+ * - PA is referenced and while it is no discard is possible+ * - PA is referenced until block isn't marked in on-disk bitmap+ * - PA changes only after on-disk bitmap+ * - discard must not compete with init. either init is done before+ * any discard or they're serialized somehow+ * - buddy init as sum of on-disk bitmap and PAs is done atomically+ *+ * a special case when we've used PA to emptiness. 
no need to modify buddy+ * in this case, but we should care about concurrent init+ *+ */++ /*+ * Logic in few words:+ *+ * - allocation:+ * load group+ * find blocks+ * mark bits in on-disk bitmap+ * release group+ *+ * - use preallocation:+ * find proper PA (per-inode or group)+ * load group+ * mark bits in on-disk bitmap+ * release group+ * release PA+ *+ * - free:+ * load group+ * mark bits in on-disk bitmap+ * release group+ *+ * - discard preallocations in group:+ * mark PAs deleted+ * move them onto local list+ * load on-disk bitmap+ * load group+ * remove PA from object (inode or locality group)+ * mark free blocks in-core+ *+ * - discard inode's preallocations:+ */++/*+ * Locking rules+ *+ * Locks:+ * - bitlock on a group (group)+ * - object (inode/locality) (object)+ * - per-pa lock (pa)+ *+ * Paths:+ * - new pa+ * object+ * group+ *+ * - find and use pa:+ * pa+ *+ * - release consumed pa:+ * pa+ * group+ * object+ *+ * - generate in-core bitmap:+ * group+ * pa+ *+ * - discard all for given object (inode, locality group):+ * object+ * pa+ * group+ *+ * - discard all for given group:+ * group+ * pa+ * group+ * object+ *+ */++/*+ * with AGGRESSIVE_CHECK allocator runs consistency checks over+ * structures. these checks slow things down a lot+ */+#define AGGRESSIVE_CHECK__++/*+ * with DOUBLE_CHECK defined mballoc creates persistent in-core+ * bitmaps, maintains and uses them to check for double allocations+ */+#define DOUBLE_CHECK__++/*+ */+#define MB_DEBUG__+#ifdef MB_DEBUG+#define mb_debug(fmt,a...) 
printk(fmt, ##a)+#else+#define mb_debug(fmt,a...)+#endif++/*+ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory+ * and you can monitor it in /proc/fs/ext3/<dev>/mb_history+ */+#define EXT3_MB_HISTORY+#define EXT3_MB_HISTORY_ALLOC 1 /* allocation */+#define EXT3_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */+#define EXT3_MB_HISTORY_DISCARD 4 /* preallocation discarded */+#define EXT3_MB_HISTORY_FREE 8 /* free */++#define EXT3_MB_HISTORY_DEFAULT (EXT3_MB_HISTORY_ALLOC | \+ EXT3_MB_HISTORY_PREALLOC | \+ EXT3_MB_HISTORY_DISCARD | \+ EXT3_MB_HISTORY_FREE)++/*+ * How long mballoc can look for a best extent (in found extents)+ */+#define MB_DEFAULT_MAX_TO_SCAN 200++/*+ * How long mballoc must look for a best extent
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -