numa.c

来自「LINUX 2.6.17.4的源码」· C语言 代码 · 共 828 行 · 第 1/2 页

C
828
字号
/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <asm/sparsemem.h>
#include <asm/lmb.h>
#include <asm/system.h>
#include <asm/smp.h>

static int numa_enabled = 1;

static int numa_debug;
/*
 * Debug printk, gated on numa_debug.  Wrapped in do { } while (0) so that
 * "if (x) dbg(...); else ..." parses the way it reads.
 */
#define dbg(args...) do { if (numa_debug) { printk(KERN_INFO args); } } while (0)

int numa_cpu_lookup_table[NR_CPUS];
cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
EXPORT_SYMBOL(node_data);

static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;

/*
 * We need somewhere to store start/end/node for each region until we have
 * allocated the real node_data structures.
 *
 * The table is terminated by the first entry whose end_pfn is 0.
 */
#define MAX_REGIONS	(MAX_LMB_REGIONS*2)
static struct {
	unsigned long start_pfn;
	unsigned long end_pfn;
	int nid;
} init_node_data[MAX_REGIONS] __initdata;

/*
 * Look up the node id of the early region containing @pfn.
 *
 * Returns the nid of the first region with start_pfn <= pfn < end_pfn,
 * or -1 if no recorded region contains it.
 */
int __init early_pfn_to_nid(unsigned long pfn)
{
	unsigned int i;

	for (i = 0; init_node_data[i].end_pfn; i++) {
		unsigned long start_pfn = init_node_data[i].start_pfn;
		unsigned long end_pfn = init_node_data[i].end_pfn;

		if ((start_pfn <= pfn) && (pfn < end_pfn))
			return init_node_data[i].nid;
	}

	return -1;
}

/*
 * Record @pages page frames starting at @start_pfn as belonging to @nid.
 *
 * If the new range directly follows or directly precedes an existing
 * region of the same node, that region is grown instead of adding a new
 * entry.  When the table is full the range is dropped with a warning.
 */
void __init add_region(unsigned int nid, unsigned long start_pfn,
		       unsigned long pages)
{
	unsigned int i;

	dbg("add_region nid %d start_pfn 0x%lx pages 0x%lx\n",
		nid, start_pfn, pages);

	for (i = 0; init_node_data[i].end_pfn; i++) {
		if (init_node_data[i].nid != nid)
			continue;
		/* New range starts where this region ends: extend upwards */
		if (init_node_data[i].end_pfn == start_pfn) {
			init_node_data[i].end_pfn += pages;
			return;
		}
		/* New range ends where this region starts: extend downwards */
		if (init_node_data[i].start_pfn == (start_pfn + pages)) {
			init_node_data[i].start_pfn -= pages;
			return;
		}
	}

	/*
	 * Leave last entry NULL so we don't iterate off the end (we use
	 * entry.end_pfn to terminate the walk).
	 */
	if (i >= (MAX_REGIONS - 1)) {
		printk(KERN_ERR "WARNING: too many memory regions in "
				"numa code, truncating\n");
		return;
	}

	init_node_data[i].start_pfn = start_pfn;
	init_node_data[i].end_pfn = start_pfn + pages;
	init_node_data[i].nid = nid;
}

/*
 * Summarise all recorded regions of node @nid.
 *
 * On return *start_pfn is the lowest start and *end_pfn the highest end
 * over the node's regions, and *pages_present is the sum of their sizes.
 * All three are 0 if the node has no regions.
 *
 * We assume init_node_data has no overlapping regions.
 */
void __init get_region(unsigned int nid, unsigned long *start_pfn,
		       unsigned long *end_pfn, unsigned long *pages_present)
{
	unsigned int i;

	*start_pfn = -1UL;
	*end_pfn = *pages_present = 0;

	for (i = 0; init_node_data[i].end_pfn; i++) {
		if (init_node_data[i].nid != nid)
			continue;

		*pages_present += init_node_data[i].end_pfn -
			init_node_data[i].start_pfn;

		if (init_node_data[i].start_pfn < *start_pfn)
			*start_pfn = init_node_data[i].start_pfn;

		if (init_node_data[i].end_pfn > *end_pfn)
			*end_pfn = init_node_data[i].end_pfn;
	}

	/* We didn't find a matching region, return start/end as 0 */
	if (*start_pfn == -1UL)
		*start_pfn = 0;
}

/* Record that logical @cpu lives on @node in both lookup tables. */
static void __cpuinit map_cpu_to_node(int cpu, int node)
{
	numa_cpu_lookup_table[cpu] = node;

	dbg("adding cpu %d to node %d\n", cpu, node);

	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node])))
		cpu_set(cpu, numa_cpumask_lookup_table[node]);
}

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Remove @cpu from the cpumask of the node it is currently mapped to.
 * numa_cpu_lookup_table[cpu] itself is left untouched.
 */
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
		cpu_clear(cpu, numa_cpumask_lookup_table[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * Find the device tree node of type "cpu" that describes logical @cpu.
 *
 * A node matches if the hardware id of @cpu appears in its
 * "ibm,ppc-interrupt-server#s" property or, when that property is absent
 * or empty, if it equals the first cell of "reg".
 *
 * Returns the matching node or NULL.  The caller is expected to
 * of_node_put() the result, as numa_setup_cpu() does.
 */
static struct device_node * __cpuinit find_cpu_node(unsigned int cpu)
{
	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
	struct device_node *cpu_node = NULL;
	unsigned int *interrupt_server, *reg;
	int len;

	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
		/*
		 * Reset len before every lookup so that a lookup which fails
		 * without writing it cannot leave an indeterminate or stale
		 * length behind.
		 */
		len = 0;

		/* Try interrupt server first */
		interrupt_server = (unsigned int *)get_property(cpu_node,
					"ibm,ppc-interrupt-server#s", &len);

		/* Property length is in bytes; convert to a cell count */
		len /= (int)sizeof(u32);

		if (interrupt_server && (len > 0)) {
			while (len--) {
				if (interrupt_server[len] == hw_cpuid)
					return cpu_node;
			}
		} else {
			len = 0;
			reg = (unsigned int *)get_property(cpu_node,
							   "reg", &len);
			/* Only read reg[0] if a whole cell is present */
			if (reg && (len >= (int)sizeof(u32)) &&
			    (reg[0] == hw_cpuid))
				return cpu_node;
		}
	}

	return NULL;
}

/*
 * Return the raw "ibm,associativity" property of @dev, or NULL.
 *
 * must hold reference to node during call
 */
static int *of_get_associativity(struct device_node *dev)
{
	return (int *)get_property(dev, "ibm,associativity", NULL);
}

/*
 * Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 * info is found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
	int nid = -1;
	unsigned int *tmp;

	if (min_common_depth == -1)
		goto out;

	tmp = (unsigned int *)of_get_associativity(device);
	if (!tmp)
		goto out;

	/* tmp[0] is used as the number of entries that follow it */
	if (tmp[0] >= min_common_depth)
		nid = tmp[min_common_depth];

	/*
	 * POWER4 LPAR uses 0xffff as invalid node.  Also reject values that
	 * went negative when the unsigned cell was stored into nid, so that
	 * callers only ever see -1 or a valid node id.
	 */
	if (nid == 0xffff || nid < 0 || nid >= MAX_NUMNODES)
		nid = -1;
out:
	return nid;
}

/*
 * Walk the device tree upwards, looking for an associativity id.
 *
 * Starts at @device and moves to each parent in turn until a node yields
 * a valid nid.  Returns that nid, or -1 if the walk ends without one.
 */
int of_node_to_nid(struct device_node *device)
{
	struct device_node *tmp;
	int nid = -1;

	of_node_get(device);
	while (device) {
		nid = of_node_to_nid_single(device);
		if (nid != -1)
			break;

		tmp = device;
		device = of_get_parent(tmp);
		of_node_put(tmp);
	}
	of_node_put(device);

	return nid;
}
EXPORT_SYMBOL_GPL(of_node_to_nid);

/*
 * In theory, the "ibm,associativity" property may contain multiple
 * associativity lists because a resource may be multiply connected
 * into the machine.  This resource then has different associativity
 * characteristics relative to its multiple connections.  We ignore
 * this for now.  We also assume that all cpu and memory sets have
 * their distances represented at a common level.  This won't be
 * true for hierarchical NUMA.
 *
 * In any case the ibm,associativity-reference-points should give
 * the correct depth for a normal NUMA system.
 *
 * - Dave Hansen <haveblue@us.ibm.com>
 *
 * Returns the depth taken from the second reference point, or -1 if
 * /rtas or a usable property cannot be found.
 */
static int __init find_min_common_depth(void)
{
	int depth;
	unsigned int *ref_points;
	struct device_node *rtas_root;
	/*
	 * Initialised so the length check below never reads an
	 * indeterminate value if the lookup fails without writing it.
	 */
	int len = 0;

	rtas_root = of_find_node_by_path("/rtas");

	if (!rtas_root)
		return -1;

	/*
	 * this property is 2 32-bit integers, each representing a level of
	 * depth in the associativity nodes.  The first is for an SMP
	 * configuration (should be all 0's) and the second is for a normal
	 * NUMA configuration.
	 */
	ref_points = (unsigned int *)get_property(rtas_root,
			"ibm,associativity-reference-points", &len);

	/* len is in bytes; both cells must be present to read ref_points[1] */
	if (ref_points && (len >= (int)(2 * sizeof(unsigned int)))) {
		depth = ref_points[1];
	} else {
		dbg("NUMA: ibm,associativity-reference-points not found.\n");
		depth = -1;
	}
	of_node_put(rtas_root);

	return depth;
}

/*
 * Fetch the address and size cell counts that apply to the first device
 * tree node of type "memory".  Panics if there is no such node.
 */
static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
	struct device_node *memory = NULL;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		panic("numa.c: No memory nodes found!");

	*n_addr_cells = prom_n_addr_cells(memory);
	*n_size_cells = prom_n_size_cells(memory);
	of_node_put(memory);
}

/*
 * Read @n consecutive 32-bit cells from *@buf, most significant cell
 * first, and advance *@buf past them.  Returns the combined value.
 */
static unsigned long __devinit read_n_cells(int n, unsigned int **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | **buf;
		(*buf)++;
	}
	return result;
}

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 *
 * Falls back to node 0 when no device tree node is found for the cpu,
 * and to any online node when the cpu's own node is unknown or offline.
 */
static int __cpuinit numa_setup_cpu(unsigned long lcpu)
{
	int nid = 0;
	struct device_node *cpu = find_cpu_node(lcpu);

	if (!cpu) {
		WARN_ON(1);
		goto out;
	}

	nid = of_node_to_nid_single(cpu);

	if (nid < 0 || !node_online(nid))
		nid = any_online_node(NODE_MASK_ALL);
out:
	map_cpu_to_node(lcpu, nid);

	of_node_put(cpu);

	return nid;
}

/*
 * CPU notifier: map a cpu to its node before it comes up and, with cpu
 * hotplug, unmap it again when it dies or its bring-up is cancelled.
 *
 * Returns NOTIFY_OK for the actions handled here, NOTIFY_DONE otherwise.
 */
static int cpu_numa_callback(struct notifier_block *nfb,
			     unsigned long action,
			     void *hcpu)
{
	unsigned long lcpu = (unsigned long)hcpu;
	int ret = NOTIFY_DONE;

	switch (action) {
	case CPU_UP_PREPARE:
		numa_setup_cpu(lcpu);
		ret = NOTIFY_OK;
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		unmap_cpu_from_node(lcpu);
		/* Must be set before the break, otherwise it never runs */
		ret = NOTIFY_OK;
		break;
#endif
	}
	return ret;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						      unsigned long size)
{
	/*
	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit.
	 */

	if (!memory_limit)
		return size;

	if (start + size <= lmb_end_of_DRAM())
		return size;

	if (start >= lmb_end_of_DRAM())
		return 0;

	return lmb_end_of_DRAM() - start;
}

static int __init parse_numa_properties(void)
{
	struct device_node *cpu = NULL;
	struct device_node *memory = NULL;
	int default_nid = 0;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	min_common_depth = find_min_common_depth();

	if (min_common_depth < 0)
		return min_common_depth;

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);

	/*
	 * Even though we connect cpus to numa domains later in SMP
	 * init, we need to know the node ids now. This is because
	 * each node to be onlined must have NODE_DATA etc backing it.
	 */
	for_each_present_cpu(i) {
		int nid;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?