⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ext3-mballoc3-core.patch

📁 非常经典的一个分布式系统
💻 PATCH
📖 第 1 页 / 共 5 页
字号:
Index: linux-2.6.9-full/include/linux/ext3_fs.h===================================================================--- linux-2.6.9-full.orig/include/linux/ext3_fs.h	2007-06-08 23:44:08.000000000 +0400+++ linux-2.6.9-full/include/linux/ext3_fs.h	2007-10-17 22:25:01.000000000 +0400@@ -57,6 +57,30 @@ struct statfs; #define ext3_debug(f, a...)	do {} while (0) #endif +#define EXT3_MULTIBLOCK_ALLOCATOR	1++#define EXT3_MB_HINT_MERGE		1	/* prefer goal again. length */+#define EXT3_MB_HINT_RESERVED		2	/* blocks already reserved */+#define EXT3_MB_HINT_METADATA		4	/* metadata is being allocated */+#define EXT3_MB_HINT_FIRST		8	/* first blocks in the file */+#define EXT3_MB_HINT_BEST		16	/* search for the best chunk */+#define EXT3_MB_HINT_DATA		32	/* data is being allocated */+#define EXT3_MB_HINT_NOPREALLOC		64	/* don't preallocate (for tails) */+#define EXT3_MB_HINT_GROUP_ALLOC	128	/* allocate for locality group */+#define EXT3_MB_HINT_GOAL_ONLY		256	/* allocate goal blocks or none */++struct ext3_allocation_request {+	struct inode *inode;	/* target inode for block we're allocating */+	unsigned long logical;	/* logical block in target inode */+	unsigned long goal;	/* phys. target (a hint) */+	unsigned long lleft;	/* the closest logical allocated block to the left */+	unsigned long pleft;	/* phys. block for ^^^ */+	unsigned long lright;	/* the closest logical allocated block to the right */+	unsigned long pright;	/* phys. block for ^^^ */+	unsigned long len;	/* how many blocks we want to allocate */+	unsigned long flags;	/* flags. 
see above EXT3_MB_HINT_* */+};+ /*  * Special inodes numbers  */@@ -387,6 +411,14 @@ struct ext3_inode { #define ext3_find_first_zero_bit	ext2_find_first_zero_bit #define ext3_find_next_zero_bit		ext2_find_next_zero_bit +#ifndef ext2_find_next_le_bit+#ifdef __LITTLE_ENDIAN+#define ext2_find_next_le_bit(addr, size, off) find_next_bit((addr), (size), (off))+#else+#error "mballoc needs a patch for big-endian systems - CFS bug 10634"+#endif	/* __LITTLE_ENDIAN */+#endif	/* !ext2_find_next_le_bit */+ /*  * Maximal mount counts between two filesystem checks  */@@ -763,6 +795,20 @@ extern unsigned long ext3_count_dirs (st extern void ext3_check_inodes_bitmap (struct super_block *); extern unsigned long ext3_count_free (struct buffer_head *, unsigned); +/* mballoc.c */+extern long ext3_mb_stats;+extern long ext3_mb_max_to_scan;+extern int ext3_mb_init(struct super_block *, int);+extern int ext3_mb_release(struct super_block *);+extern unsigned long ext3_mb_new_blocks(handle_t *, struct ext3_allocation_request *, int *);+extern int ext3_mb_reserve_blocks(struct super_block *, int);+extern void ext3_mb_release_blocks(struct super_block *, int);+extern void ext3_mb_release_blocks(struct super_block *, int);+extern void ext3_mb_discard_inode_preallocations(struct inode *);+extern int __init init_ext3_proc(void);+extern void exit_ext3_proc(void);+extern void ext3_mb_free_blocks(handle_t *, struct inode *, unsigned long, unsigned long, int, int *);+  /* inode.c */ extern int ext3_block_truncate_page(handle_t *, struct page *,Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h===================================================================--- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h	2007-06-08 23:44:07.000000000 +0400+++ linux-2.6.9-full/include/linux/ext3_fs_sb.h	2007-10-17 22:25:01.000000000 +0400@@ -81,6 +81,61 @@ struct ext3_sb_info { 	char *s_qf_names[MAXQUOTAS];		/* Names of quota files with journalled quota */ 	int s_jquota_fmt;			/* Format of quota to use */ 
#endif++	/* for buddy allocator */+	struct ext3_group_info ***s_group_info;+	struct inode *s_buddy_cache;+	long s_blocks_reserved;+	spinlock_t s_reserve_lock;+	struct list_head s_active_transaction;+	struct list_head s_closed_transaction;+	struct list_head s_committed_transaction;+	spinlock_t s_md_lock;+	tid_t s_last_transaction;+	unsigned short *s_mb_offsets, *s_mb_maxs;++	/* tunables */+	unsigned long s_mb_factor;+	unsigned long s_stripe;+	unsigned long s_mb_stream_request;+	unsigned long s_mb_max_to_scan;+	unsigned long s_mb_min_to_scan;+	unsigned long s_mb_max_groups_to_scan;+	unsigned long s_mb_stats;+	unsigned long s_mb_order2_reqs;++	/* history to debug policy */+	struct ext3_mb_history *s_mb_history;+	int s_mb_history_cur;+	int s_mb_history_max;+	int s_mb_history_num;+	struct proc_dir_entry *s_mb_proc;+	spinlock_t s_mb_history_lock;+	int s_mb_history_filter;++	/* stats for buddy allocator */+	spinlock_t s_mb_pa_lock;+	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */+	atomic_t s_bal_success;	/* we found long enough chunks */+	atomic_t s_bal_allocated;	/* in blocks */+	atomic_t s_bal_ex_scanned;	/* total extents scanned */+	atomic_t s_bal_goals;	/* goal hits */+	atomic_t s_bal_breaks;	/* too long searches */+	atomic_t s_bal_2orders;	/* 2^order hits */+	spinlock_t s_bal_lock;+	unsigned long s_mb_buddies_generated;+	unsigned long long s_mb_generation_time;+	atomic_t s_mb_lost_chunks;+	atomic_t s_mb_preallocated;+	atomic_t s_mb_discarded;++	/* locality groups */+	struct ext3_locality_group *s_locality_groups;+ }; +#define EXT3_GROUP_INFO(sb, group)					   \+	EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \+				 [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)]+ #endif	/* _LINUX_EXT3_FS_SB */Index: linux-2.6.9-full/fs/ext3/super.c===================================================================--- linux-2.6.9-full.orig/fs/ext3/super.c	2007-06-08 23:44:08.000000000 +0400+++ linux-2.6.9-full/fs/ext3/super.c	2007-10-17 22:26:27.000000000 +0400@@ 
-394,6 +394,7 @@ void ext3_put_super (struct super_block  	struct ext3_super_block *es = sbi->s_es; 	int i; +	ext3_mb_release(sb); 	ext3_ext_release(sb); 	ext3_xattr_put_super(sb); 	journal_destroy(sbi->s_journal);@@ -463,6 +464,8 @@ static struct inode *ext3_alloc_inode(st 	ei->vfs_inode.i_version = 1;  	memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent));+	INIT_LIST_HEAD(&ei->i_prealloc_list);+	spin_lock_init(&ei->i_prealloc_lock); 	return &ei->vfs_inode; } @@ -2576,7 +2579,13 @@ static struct file_system_type ext3_fs_t  static int __init init_ext3_fs(void) {-	int err = init_ext3_xattr();+	int err;++	err = init_ext3_proc();+	if (err)+		return err;++	err = init_ext3_xattr(); 	if (err) 		return err; 	err = init_inodecache();@@ -2598,6 +2607,7 @@ static void __exit exit_ext3_fs(void) 	unregister_filesystem(&ext3_fs_type); 	destroy_inodecache(); 	exit_ext3_xattr();+	exit_ext3_proc(); }  int ext3_prep_san_write(struct inode *inode, long *blocks,Index: linux-2.6.9-full/fs/ext3/mballoc.c===================================================================--- linux-2.6.9-full.orig/fs/ext3/mballoc.c	2007-10-17 21:59:51.072534980 +0400+++ linux-2.6.9-full/fs/ext3/mballoc.c	2007-10-17 23:09:22.000000000 +0400@@ -0,0 +1,4404 @@+/*+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com+ * Written by Alex Tomas <alex@clusterfs.com>+ *+ * This program is free software; you can redistribute it and/or modify+ * it under the terms of the GNU General Public License version 2 as+ * published by the Free Software Foundation.+ *+ * This program is distributed in the hope that it will be useful,+ * but WITHOUT ANY WARRANTY; without even the implied warranty of+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
See the+ * GNU General Public License for more details.+ *+ * You should have received a copy of the GNU General Public License+ * along with this program; if not, write to the Free Software+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA+ */+++/*+ * mballoc.c contains the multiblocks allocation routines+ */++#include <linux/time.h>+#include <linux/fs.h>+#include <linux/namei.h>+#include <linux/ext3_jbd.h>+#include <linux/jbd.h>+#include <linux/ext3_fs.h>+#include <linux/quotaops.h>+#include <linux/buffer_head.h>+#include <linux/module.h>+#include <linux/swap.h>+#include <linux/proc_fs.h>+#include <linux/pagemap.h>+#include <linux/seq_file.h>+#include <linux/version.h>++/*+ * MUSTDO:+ *   - test ext3_ext_search_left() and ext3_ext_search_right()+ *   - search for metadata in few groups+ *+ * TODO v4:+ *   - normalization should take into account whether file is still open+ *   - discard preallocations if no free space left (policy?)+ *   - don't normalize tails+ *   - quota+ *   - reservation for superuser+ *+ * TODO v3:+ *   - bitmap read-ahead (proposed by Oleg Drokin aka green)+ *   - track min/max extents in each group for better group selection+ *   - mb_mark_used() may allocate chunk right after splitting buddy+ *   - tree of groups sorted by number of free blocks+ *   - error handling+ */++/*+ * mballoc operates on the following data:+ *  - on-disk bitmap+ *  - in-core buddy (actually includes buddy and bitmap)+ *  - preallocation descriptors (PAs)+ *+ * there are two types of preallocations:+ *  - inode+ *    assigned to specific inode and can be used for this inode only.+ *    it describes part of inode's space preallocated to specific+ *    physical blocks. any block from that preallocation can be used+ *    independently. the descriptor just tracks number of blocks left+ *    unused. so, before taking some block from descriptor, one must+ *    make sure the corresponding logical block isn't allocated yet. 
this+ *    also means that freeing any block within descriptor's range+ *    must discard all preallocated blocks.+ *  - locality group+ *    assigned to specific locality group which does not translate to+ *    permanent set of inodes: inode can join and leave group. space+ *    from this type of preallocation can be used for any inode. thus+ *    it's consumed from the beginning to the end.+ *+ * relation between them can be expressed as:+ *    in-core buddy = on-disk bitmap + preallocation descriptors+ *+ * this means blocks mballoc considers used are:+ *  - allocated blocks (persistent)+ *  - preallocated blocks (non-persistent)+ *+ * consistency in mballoc world means that at any time a block is either+ * free or used in ALL structures. notice: "any time" should not be read+ * literally -- time is discrete and delimited by locks.+ *+ *  to keep it simple, we don't use block numbers, instead we count number of+ *  blocks: how many blocks are marked used/free in on-disk bitmap, buddy and PA.+ *+ * all operations can be expressed as:+ *  - init buddy:			buddy = on-disk + PAs+ *  - new PA:				buddy += N; PA = N+ *  - use inode PA:			on-disk += N; PA -= N+ *  - discard inode PA			buddy -= on-disk - PA; PA = 0+ *  - use locality group PA		on-disk += N; PA -= N+ *  - discard locality group PA		buddy -= PA; PA = 0+ *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap+ *        is used in real operation because we can't know actual used+ *        bits from PA, only from on-disk bitmap+ *+ * if we follow this strict logic, then all operations above should be atomic.+ * given some of them can block, we'd have to use something like semaphores+ * killing performance on high-end SMP hardware. 
let's try to relax it using+ * the following knowledge:+ *  1) if buddy is referenced, it's already initialized+ *  2) while block is used in buddy and the buddy is referenced,+ *     nobody can re-allocate that block+ *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has+ *     bit set and PA claims same block, it's OK. IOW, one can set bit in+ *     on-disk bitmap if buddy has same bit set or/and PA covers the corresponding+ *     block+ *+ * so, now we're building a concurrency table:+ *  - init buddy vs.+ *    - new PA+ *      blocks for PA are allocated in the buddy, buddy must be referenced+ *      until PA is linked to allocation group to avoid concurrent buddy init+ *    - use inode PA+ *      we need to make sure that either on-disk bitmap or PA has uptodate data+ *      given (3) we care that PA-=N operation doesn't interfere with init+ *    - discard inode PA+ *      the simplest way would be to have buddy initialized by the discard+ *    - use locality group PA+ *      again PA-=N must be serialized with init+ *    - discard locality group PA+ *      the simplest way would be to have buddy initialized by the discard+ *  - new PA vs.+ *    - use inode PA+ *      i_truncate_mutex serializes them+ *    - discard inode PA+ *      discard process must wait until PA isn't used by another process+ *    - use locality group PA+ *      some mutex should serialize them+ *    - discard locality group PA+ *      discard process must wait until PA isn't used by another process+ *  - use inode PA vs.+ *    - use inode PA+ *      i_truncate_mutex or another mutex should serialize them+ *    - discard inode PA+ *      discard process must wait until PA isn't used by another process+ *    - use locality group PA+ *      nothing wrong here -- they're different PAs covering different blocks+ *    - discard locality group PA+ *      discard process must wait until PA isn't used by another process+ *+ * now we're ready to draw a few conclusions:+ *  - PA is 
referenced and while it is no discard is possible+ *  - PA is referenced until block isn't marked in on-disk bitmap+ *  - PA changes only after on-disk bitmap+ *  - discard must not compete with init. either init is done before+ *    any discard or they're serialized somehow+ *  - buddy init as sum of on-disk bitmap and PAs is done atomically+ *+ * a special case when we've used PA to emptiness. no need to modify buddy+ * in this case, but we should care about concurrent init+ *+ */++ /*+ * Logic in few words:+ *+ *  - allocation:+ *    load group+ *    find blocks+ *    mark bits in on-disk bitmap+ *    release group+ *+ *  - use preallocation:+ *    find proper PA (per-inode or group)+ *    load group+ *    mark bits in on-disk bitmap+ *    release group+ *    release PA+ *+ *  - free:+ *    load group+ *    mark bits in on-disk bitmap+ *    release group+ *+ *  - discard preallocations in group:+ *    mark PAs deleted+ *    move them onto local list+ *    load on-disk bitmap+ *    load group+ *    remove PA from object (inode or locality group)+ *    mark free blocks in-core+ *+ *  - discard inode's preallocations:+ */++/*+ * Locking rules+ *+ * Locks:+ *  - bitlock on a group	(group)+ *  - object (inode/locality)	(object)+ *  - per-pa lock		(pa)+ *+ * Paths:+ *  - new pa+ *    object+ *    group+ *+ *  - find and use pa:+ *    pa+ *+ *  - release consumed pa:+ *    pa+ *    group+ *    object+ *+ *  - generate in-core bitmap:+ *    group+ *        pa+ *+ *  - discard all for given object (inode, locality group):+ *    object+ *        pa+ *    group+ *+ *  - discard all for given group:+ *    group+ *        pa+ *    group+ *        object+ *+ */++/*+ * with AGGRESSIVE_CHECK allocator runs consistency checks over+ * structures. 
these checks slow things down a lot+ */+#define AGGRESSIVE_CHECK__++/*+ * with DOUBLE_CHECK defined mballoc creates persistent in-core+ * bitmaps, maintains and uses them to check for double allocations+ */+#define DOUBLE_CHECK__++/*+ */+#define MB_DEBUG__+#ifdef MB_DEBUG+#define mb_debug(fmt,a...)	printk(fmt, ##a)+#else+#define mb_debug(fmt,a...)+#endif++/*+ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory+ * and you can monitor it in /proc/fs/ext3/<dev>/mb_history+ */+#define EXT3_MB_HISTORY+#define EXT3_MB_HISTORY_ALLOC		1	/* allocation */+#define EXT3_MB_HISTORY_PREALLOC	2	/* preallocated blocks used */+#define EXT3_MB_HISTORY_DISCARD		4	/* preallocation discarded */+#define EXT3_MB_HISTORY_FREE		8	/* free */++#define EXT3_MB_HISTORY_DEFAULT		(EXT3_MB_HISTORY_ALLOC | \+					 EXT3_MB_HISTORY_PREALLOC | \+					 EXT3_MB_HISTORY_DISCARD | \+					 EXT3_MB_HISTORY_FREE)++/*+ * How long mballoc can look for a best extent (in found extents)+ */+#define MB_DEFAULT_MAX_TO_SCAN		200++/*+ * How long mballoc must look for a best extent

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -