📄 005_mm_bootmem_c.html
字号:
flow: static(header); } /* used to insert page numbers */ div.google_header::before, div.google_footer::before { position: absolute; top: 0; } div.google_footer { flow: static(footer); } /* always consider this element at the start of the doc */ div#google_footer { flow: static(footer, start); } span.google_pagenumber { content: counter(page); } span.google_pagecount { content: counter(pages); } } @page { @top { content: flow(header); } @bottom { content: flow(footer); } } /* end default print css */ /* custom css *//* end custom css */ /* ui edited css */ body { font-family: Verdana; font-size: 10.0pt; line-height: normal; background-color: #ffffff; } .documentBG { background-color: #ffffff; } /* end ui edited css */</style> </head> <body revision="dcbsxfpf_46p2hgnhk:4"> <table align=center cellpadding=0 cellspacing=0 height=5716 width=768>
<tbody>
<tr>
<td height=5716 valign=top width=100%>
<pre>2005-10-24<br>mm/bootmem.c <br> <br> 上次对启动过程有个简单总结,下面再次关注一下其中的bootmem部分。<br>setup.S->asmlinkage void __init start_kernel(void) (init/main.c)<br> |<br> +-->setup_arch ---> 处理e820内存报告<br> --> 关于内存的提示信息<br> ---> 初始化bootmem (init_bootmem)<br> ---> paging_init--+<br> +-------+<br> | +--> pagetable_init(含fix map,vmalloc init)<br> \ / +--> load cr3<br> . +--> kmap_init <br> . +--> free_area_init(zone-buddy初始化)<br> . --->smp,apic,roms等处理<br> +--> idt gate modules,kmem_cache_init<br> |<br> +--> mem_init -->free_all_bootmem buddy 得到页面控制权<br> +<br> +--> proc_root_init,fork_init, ipc,inode<br> +--> smp_init<br> +<br> +--> 创建kernel thread, init (init/main.c->函数init)<br> +--->do_basic_setup<br> ---->init pci,mtrr,sysctl,mca....<br> ---->filesystem_setup<br> ---->mount_root (关注...)<br> ---->......<br> +---> free_initmem<br> +---> 打开console<br> +--->execve("/sbin/init",argv_init,envp_init);<br> +--->execve("/etc/init",argv_init,envp_init);<br> +--->execve("/bin/init",argv_init,envp_init);<br> +--->execve("/bin/sh",argv_init,envp_init);<br><br> 首先是setup_arch(arch/i386/kernel/setup.c), <br>void __init setup_arch(char **cmdline_p)<br>{<br> .....<br> /*<br> * partially used pages are not usable - thus<br> * we are rounding upwards:<br> */<br> start_pfn = PFN_UP(__pa(&_end)); /*<br> * boot mem 只能使用_end之后<br> * 的内存<br> */<br> //接着从e820报告中找最高地址的ram页面<br> //其pfn赋值给max_pfn,代码略<br> <br> //然后的代码寻找 max_low_pfn, highstart_pfn,highend_pfn<br> //逻辑简单,不再罗列<br> <br> /*<br> * Initialize the boot-time allocator (with low memory only):<br> */<br> /*<br> * 初始化boot mem,只使用 low memory<br> */<br> bootmap_size = init_bootmem(start_pfn, max_low_pfn);<br> <br> /*<br> * Register fully available low RAM pages with the bootmem allocator.<br> */<br> for (i = 0; i < e820.nr_map; i++) { //从e820中寻找ram<br> unsigned long curr_pfn, last_pfn, size;<br> /*<br> * Reserve usable low memory<br> */<br> if (e820.map[i].type != E820_RAM)<br> continue;<br> /*<br> * We are rounding up the start address of usable 
memory:<br> */<br> curr_pfn = PFN_UP(e820.map[i].addr);<br> if (curr_pfn >= max_low_pfn)<br> continue;<br> /*<br> * ... and at the end of the usable range downwards:<br> */<br> last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);<br><br> if (last_pfn > max_low_pfn)<br> last_pfn = max_low_pfn;<br><br> /*<br> * .. finally, did all the rounding and playing<br> * around just make the area go away?<br> */<br> if (last_pfn <= curr_pfn)<br> continue;<br><br> size = last_pfn - curr_pfn;<br> /*<br> * 注册ram到bootmem(标记页面为free)<br> */<br> free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));<br> }<br><br> /*<br> * Reserve the bootmem bitmap itself as well. We do this in two<br> * steps (first step was init_bootmem()) because this catches<br> * the (very unlikely) case of us accidentally initializing the<br> * bootmem allocator with an invalid RAM area.<br> */<br> /* 保留从物理地址1M开始(内核加载地址),<br> * 大小是内核image+bootmem bitmap 的物理内存<br> */<br> reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) +<br> bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));<br><br> /*<br> * reserve physical page 0 - it's a special BIOS page on many boxes,<br> * enabling clean reboots, SMP operation, laptop functions.<br> */<br> reserve_bootmem(0, PAGE_SIZE); /*保留page pfn 0*/<br> <br> ..........<br> <br><br>}<br><br> 然后在mem_init调用free_all_bootmem前就可以使用bootmem分配内存了.<br>当mem_init调用此函数后,buddy系统可以工作了.bootmem的使命即告终结.<br><br> bootmem的管理结构如下:<br> typedef struct bootmem_data {<br> unsigned long node_boot_start; /*所能看到的page的起始地址*/<br> unsigned long node_low_pfn; /*此node bootmem可管理的最大pfn*/<br> void *node_bootmem_map; /*bootmem bit位图,一般是可以使用的起始地址(除去内核image)*/<br> unsigned long last_offset; /*上次分配的内存的结束地址在last_pos内的偏移*/<br> unsigned long last_pos; /*上次分配的内存所使用的最后一个页面的pfn*/<br>} bootmem_data_t;<br> bootmem_data_t属于node, pg_data_t.<br> 在NUMA系统中,每个节点属于一个node,pgdat.<br><br>typedef struct pglist_data {<br> zone_t node_zones[MAX_NR_ZONES];<br> zonelist_t node_zonelists[NR_GFPINDEX];<br> struct page *node_mem_map; 
/*NUMA系统中的page结构数组,不再是<br> *全局变量mem_map了<br> */<br> unsigned long *valid_addr_bitmap;<br> struct bootmem_data *bdata;<br> unsigned long node_start_paddr;<br> unsigned long node_start_mapnr;<br> unsigned long node_size;<br> int node_id;<br> struct pglist_data *node_next;<br>} pg_data_t;<br> <br> NUMA系统中的page结构数组,不再是全局变量mem_map了,而是每个node的<br>pgdat.node_mem_map.<br> linux2.4.0中对NUMA的处理比较晦涩,不如直接看2.6的代码. 在2.6中看宏<br>#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)<br> <br> 在NUMA系统pfn_to_page定义如下:<br> #define pfn_to_page(pfn) \<br>({ \<br> unsigned long __pfn = pfn; \<br> int __node = pfn_to_nid(__pfn); \<br> &node_mem_map(__node)[node_localnr(__pfn,__node)]; \<br>})<br> <br>#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map)<br><br> 而pfn_to_nid在mmzone.h定义为<br>/*<br> * generic node memory support, the following assumptions apply:<br> *<br> * 1) memory comes in 256Mb contigious chunks which are either present or not<br> * 2) we will not have more than 64Gb in total<br> *<br> * for now assume that 64Gb is max amount of RAM for whole system<br> * 64Gb / 4096bytes/page = 16777216 pages<br> */<br>#define MAX_NR_PAGES 16777216<br>#define MAX_ELEMENTS 256<br>#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS)<br><br>extern s8 physnode_map[];<br><br>static inline int pfn_to_nid(unsigned long pfn)<br>{<br>#ifdef CONFIG_NUMA<br> return((int) physnode_map[(pfn) / PAGES_PER_ELEMENT]);<br>#else<br> return 0;<br>#endif<br>}<br><br> 从其中注释,容易看懂pgdat之作用,以及zone的划分方式.<br> <br> <br> bootmem.c涉及到的函数应该不难,有几个相关的注释,只是罗列于此,以<br>保分析完整.<br><br><br><br><br>/* return the number of _pages_ that will be allocated for the boot bitmap */<br>unsigned long __init bootmem_bootmap_pages (unsigned long pages)<br>{<br> unsigned long mapsize;<br><br> mapsize = (pages+7)/8; //每页用一个bit管理,转换为字节,宁多勿少<br> mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; //字节数按4k对齐,宁多勿少 <br> mapsize >>= PAGE_SHIFT; //换成页面个数<br><br> return mapsize; //pages 个页面需要mapsize个页面来管理<br>}<br><br>/*<br> * Called once to set 
up the allocator itself.<br> * mapstart: bootmem所能够"管理"页面的起始pfn (i386 就是 内核_end以上) <br> * start: 此node的起始pfn,登记到位图,但只能从mapstart开始使用<br> * end : bootmem所管理的最后一个页面的pfn<br> */<br>static unsigned long __init init_bootmem_core (pg_data_t *pgdat,<br> unsigned long mapstart, unsigned long start, unsigned long end)<br>{<br> bootmem_data_t *bdata = pgdat->bdata;<br> unsigned long mapsize = ((end - start)+7)/8;<br> //每页用一个bit管理,转换为字节,宁多勿少<br><br> pgdat->node_next = pgdat_list; //pglist_datag,NUMA , 每node一个pgdat<br> pgdat_list = pgdat;<br><br> mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL);<br><br> bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);<br> //只设置了管理位图的起始地址,需要明确调用reserve_bootmem保留内核<br> //和boot mem 自己使用的内存<br><br> bdata->node_boot_start = (start << PAGE_SHIFT);<br> bdata->node_low_pfn = end;<br><br> /*<br> * Initially all pages are reserved - setup_arch() has to<br> * register free RAM areas explicitly.<br> * 开始都处于已使用状态,需要setup_arch 注册ram页面<br> */<br> memset(bdata->node_bootmem_map, 0xff, mapsize);<br><br> return mapsize;<br>}<br><br><br>/*<br> * We 'merge' subsequent allocations to save space. 
We might 'lose'<br> * some fraction of a page if allocations cannot be satisfied due to<br> * size constraints on boxes where there is physical RAM space<br> * fragmentation - in these cases * (mostly large memory boxes) this<br> * is not a problem.<br> *<br> * On low memory boxes we get it right in 100% of the cases.<br> */<br><br>/*<br> * alignment has to be a power of 2 value.<br> */<br> /*<br> * align : 按align对齐<br> * goal: 请求目标地址(以上的)内存<br> */<br>static void * __init __alloc_bootmem_core (bootmem_data_t *bdata, <br> unsigned long size, unsigned long align, unsigned long goal)<br>{<br> unsigned long i, start = 0; //start is pfn<br> void *ret;<br> unsigned long offset, remaining_size;<br> unsigned long areasize, preferred, incr;//areasize is pfn number<br> unsigned long eidx = bdata->node_low_pfn - (bdata->node_boot_start >><br> PAGE_SHIFT);<br><br> if (!size) BUG();<br><br> /*<br> * We try to allocate bootmem pages above 'goal'<br> * first, then we try to allocate lower pages.<br> */<br> if (goal && (goal >= bdata->node_boot_start) && <br> ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) {<br> preferred = goal - bdata->node_boot_start;<br> } else<br> preferred = 0;<br><br> /* 按要求对齐 */<br> preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT; <br> areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; /*size 按page对齐,up round*/<br> incr = align >> PAGE_SHIFT ? 
: 1; /*按对齐要求计算每次步进几个pfn*/<br><br>restart_scan:<br> for (i = preferred; i < eidx; i += incr) {<br> unsigned long j;<br> if (test_bit(i, bdata->node_bootmem_map))<br> continue;<br> for (j = i + 1; j < i + areasize; ++j) {<br> if (j >= eidx)<br> goto fail_block;<br> if (test_bit (j, bdata->node_bootmem_map))<br> goto fail_block;<br> }<br> start = i;<br> goto found; /*找到了所要求的几个连续的page*/<br> fail_block:;<br> }<br> if (preferred) {<br> preferred = 0; /*<br>不能满足goal 要求,尝试从node_boot_start开始寻找*/<br> goto restart_scan;<br> }<br>found:<br> if (start >= eidx)<br> BUG();<br><br> /*<br> * Is the next page of the previous allocation-end the start<br> * of this allocation's buffer? If yes then we can 'merge'<br> * the previous partial page with this allocation.<br> */<br> if (align <= PAGE_SIZE<br> && bdata->last_offset && bdata->last_pos+1 == start) {<br> offset = (bdata->last_offset+align-1) & ~(align-1);<br> if (offset > PAGE_SIZE)<br> BUG();<br> remaining_size = PAGE_SIZE-offset;<br> if (size < remaining_size) {<br> areasize = 0;<br> // last_pos unchanged<br> bdata->last_offset = offset+size;<br> ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +<br> bdata->node_boot_start);<br> } else {<br> remaining_size = size - remaining_size;<br> areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE;<br> ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +<br> bdata->node_boot_start);<br> bdata->last_pos = start+areasize-1;<br> bdata->last_offset = remaining_size;<br> }<br> bdata->last_offset &= ~PAGE_MASK;<br> } else {<br> bdata->last_pos = start + areasize - 1;/*start包含在内,故减1*/<br> bdata->last_offset = size & ~PAGE_MASK; /*<br> * 上次分配的结束地址在页面last_pos<br> * 内的偏移<br> */<br> ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start);<br> }<br> /*<br> * Reserve the area now:<br> */<br> for (i = start; i < start+areasize; i++)<br> if (test_and_set_bit(i, bdata->node_bootmem_map))<br> BUG();<br> memset(ret, 0, size);<br> return ret;<br>}<br><br><br>/*<br> * 
释放未使用的页面和自己使用的页面到buddy系统<br> */<br>static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)<br><br><br> 罗列的代码,其中有注释.其余函数.....算了吧,没有注释,用不着了.<br></pre>
</td>
</tr>
</tbody>
</table></body></html>
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -