📄 fft.c
字号:
/*#line 185 "/home/hg/splash2/codes/null_macros/c.m4.null.POSIX_BARRIER"*//*#line 1 "fft.C"*//*************************************************************************//* *//* Copyright (c) 1994 Stanford University *//* *//* All rights reserved. *//* *//* Permission is given to use, copy, and modify this software for any *//* non-commercial purpose as long as this copyright notice is not *//* removed. All other uses, including redistribution in whole or in *//* part, are forbidden without prior written permission. *//* *//* This software is provided with absolutely no warranty and no *//* support. *//* *//*************************************************************************//*************************************************************************//* *//* Perform 1D fast Fourier transform using six-step FFT method *//* *//* 1) Performs staggered, blocked transposes for cache-line reuse *//* 2) Roots of unity rearranged and distributed for only local *//* accesses during application of roots of unity *//* 3) Small set of roots of unity elements replicated locally for *//* 1D FFTs (less than root N elements replicated at each node) *//* 4) Matrix data structures are padded to reduce cache mapping *//* conflicts *//* *//* Command line options: *//* *//* -mM : M = even integer; 2**M total complex data points transformed. *//* -pP : P = number of processors; Must be a power of 2. *//* -nN : N = number of cache lines. *//* -lL : L = Log base 2 of cache line length in bytes. *//* -s : Print individual processor timing statistics. *//* -t : Perform FFT and inverse FFT. Test output by comparing the *//* integral of the original data to the integral of the data *//* that results from performing the FFT and inverse FFT. *//* -o : Print out complex data points. *//* -h : Print out command line options. *//* *//* Note: This version works under both the FORK and SPROC models *//* *//*************************************************************************//***********************************************************************************************//*全局宏及全局变量声明*//***********************************************************************************************/#include <stdio.h>#include <math.h>#include <pthread.h>#include <sys/time.h>#include <unistd.h>#include <stdlib.h>#include <malloc.h>#define PAGE_SIZE 4096 /*Cache参数需要改变,暂不复制,使用默认值*/#define NUM_CACHE_LINES 65536 /*Cache参数需要改变,暂不复制,使用默认值*/#define LOG2_LINE_SIZE 4 /*Cache参数需要改变,暂不复制,使用默认值*/#define PI 3.1416#define DEFAULT_M 10 /*FFT参数,暂使用默认值*/#define DEFAULT_P 64 /*处理器数量需要改变 */#define MAX_THREADS 32#define SWAP_VALS(a,b) {double tmp; tmp=a; a=b; b=tmp;} /*互换值函数*/pthread_t PThreadTable[MAX_THREADS]; /*建立一个类型为pthread内容为线程的数组*//*定义变量为*Global的结构体*/struct GlobalMemory { long id; pthread_mutex_t (idlock); /*互斥体型 idlock*/ pthread_barrier_t (start); /*barrier型 start*/ long *transtimes; long *totaltimes; unsigned long starttime; unsigned long finishtime; unsigned long initdonetime;} *Global; long P = DEFAULT_P ; /*该默认值为测试方案中的数值,即为64*/long M = DEFAULT_M;long N; /* N = 2^M */long rootN; /* rootN = N^1/2 */double *x; /* x is the original time-domain data */double *trans; /* trans is used as scratch space */double *umain; /* umain is roots of unity for 1D FFTs */double *umain2; /* umain2 is entire roots of unity matrix */long test_result = 1; /* 可以进行结果正确性的判断,但可以不将判断结果打印,赋值为1 */long doprint = 1; /* 不需要打印结果,但需要赋值为1 */long dostats = 1; /* 需要各个处理器结果输出,赋值为1 */long transtime = 0;long transtime2 = 0;long avgtranstime = 0;long avgcomptime = 0;unsigned long transstart = 0;unsigned long transend = 0;long maxtotal=0;long mintotal=0;double maxfrac=0;double minfrac=0;double avgfractime=0;long orig_num_lines = NUM_CACHE_LINES; /* number of cache lines 需要输入数据,暂使用默认值*/long num_cache_lines = NUM_CACHE_LINES; /* number of cache lines 需要输入数据,暂使用默认值*/long log2_line_size = LOG2_LINE_SIZE; /* 需要输入数据,暂使用默认值*/long line_size;long rowsperproc;/*double ck1;*//*double ck3;*/ /* checksums for testing answer */long pad_length;/***********************************************************************************************//*对整个程序需调用的函数进行声明*//***********************************************************************************************/void SlaveStart(void);double TouchArray(double *x, double *scratch, double *u, double *upriv, long MyFirst, long MyLast);double CheckSum(double *x);void InitX(double *x);void InitU(long N, double *u);void InitU2(long N, double *u, long n1);long BitReverse(long M, long k);void FFT1D(long direction, long M, long N, double *x, double *scratch, double *upriv, double *umain2, long MyNum, long *l_transtime, long MyFirst, long MyLast, long pad_length, long test_result, long dostats);void TwiddleOneCol(long direction, long n1, long j, double *u, double *x, long pad_length);void Scale(long n1, long N, double *x);void Transpose(long n1, double *src, double *dest, long MyNum, long MyFirst, long MyLast, long pad_length);void CopyColumn(long n1, double *src, double *dest);void Reverse(long N, long M, double *x);void FFT1DOnce(long direction, long M, long N, double *u, double *x);/*void PrintArray(long N, double *x); 不需要打印FFT结果,注释掉*/void printerr(char *s);long log_2(long number);void srand48(long int seedval);double drand48(void);/***********************************************************************************************//*主函数区域*//***********************************************************************************************/int main(int argc, char *argv[]){ long i; long c; /*extern char *optarg;*/ /*直接赋值,无需从外部输入数据*/ long m1; long factor; long pages; unsigned long start; { struct timeval FullTime; gettimeofday(&FullTime, NULL); (start) = (unsigned long)(FullTime.tv_usec + FullTime.tv_sec * 1000000);};/***********************************************************************************************//* 判断输入数据是否合法,并初始化整个函数,测试时无需进行判断 while ((c = getopt(argc, argv, "p:m:n:l:stoh")) != -1) { switch(c) { case 'p': P = atoi(optarg); if (P < 1) { printerr("P must be >= 1\n"); exit(-1); } if (log_2(P) == -1) { printerr("P must be a power of 2\n"); exit(-1); } break; case 'm': M = atoi(optarg); m1 = M/2; if (2*m1 != M) { printerr("M must be even\n"); exit(-1); } break; case 'n': num_cache_lines = atoi(optarg); orig_num_lines = num_cache_lines; if (num_cache_lines < 1) { printerr("Number of cache lines must be >= 1\n"); exit(-1); } break; case 'l': log2_line_size = atoi(optarg); if (log2_line_size < 0) { printerr("Log base 2 of cache line length in bytes must be >= 0\n"); exit(-1); } break; case 's': dostats = !dostats; break; case 't': test_result = !test_result; break; case 'o': doprint = !doprint; break; case 'h': printf("Usage: FFT <options>\n\n"); printf("options:\n"); printf(" -mM : M = even integer; 2**M total complex data points transformed.\n"); printf(" -pP : P = number of processors; Must be a power of 2.\n"); printf(" -nN : N = number of cache lines.\n"); printf(" -lL : L = Log base 2 of cache line length in bytes.\n"); printf(" -s : Print individual processor timing statistics.\n"); printf(" -t : Perform FFT and inverse FFT. Test output by comparing the\n"); printf(" integral of the original data to the integral of the data that\n"); printf(" results from performing the FFT and inverse FFT.\n"); printf(" -o : Print out complex data points.\n"); printf(" -h : Print out command line options.\n\n"); printf("Default: FFT -m%1d -p%1d -n%1d -l%1d\n", DEFAULT_M,DEFAULT_P,NUM_CACHE_LINES,LOG2_LINE_SIZE); exit(0); break; } }*//***********************************************************************************************/ {;};/*Padding algorithm */ N = 1<<M; /*M = even integer; 2**M total complex data points transformed.*/ rootN = 1<<(M/2); rowsperproc = rootN/P;/************************************************************************************************//* 无需对rowperproc值进行判断 if (rowsperproc == 0) { printerr("Matrix not large enough. 2**(M/2) must be >= P\n"); exit(-1); }*/ /************************************************************************************************/ line_size = 1 << log2_line_size; if (line_size < 2*sizeof(double)) { printf("WARNING: Each element is a complex double (%ld bytes)\n",2*sizeof(double)); printf(" => Less than one element per cache line\n"); printf(" Computing transpose blocking factor\n"); factor = (2*sizeof(double)) / line_size; num_cache_lines = orig_num_lines / factor; } if (line_size <= 2*sizeof(double)) { pad_length = 1; } else { pad_length = line_size / (2*sizeof(double)); } if (rowsperproc * rootN * 2 * sizeof(double) >= PAGE_SIZE) { pages = (2 * pad_length * sizeof(double) * rowsperproc) / PAGE_SIZE; if (pages * PAGE_SIZE != 2 * pad_length * sizeof(double) * rowsperproc) { pages ++; } pad_length = (pages * PAGE_SIZE) / (2 * sizeof(double) * rowsperproc); } else { pad_length = (PAGE_SIZE - (rowsperproc * rootN * 2 * sizeof(double))) / (2 * sizeof(double) * rowsperproc); if (pad_length * (2 * sizeof(double) * rowsperproc) != (PAGE_SIZE - (rowsperproc * rootN * 2 * sizeof(double)))) { printerr("Padding algorithm unsuccessful\n"); exit(-1); } } /*使用Valloc函数为变量分配内存,地址为memory每页的边界*/ Global = (struct GlobalMemory *) valloc(sizeof(struct GlobalMemory));; x = (double *) valloc(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE);; trans = (double *) valloc(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE);; umain = (double *) valloc(2*rootN*sizeof(double));; umain2 = (double *) valloc(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE);; Global->transtimes = (long *) valloc(P*sizeof(long));; Global->totaltimes = (long *) valloc(P*sizeof(long));; /************************************************************************************************/ /* 无需进行判断 if (Global == NULL) { printerr("Could not malloc memory for Global\n"); exit(-1); } else if (x == NULL) { printerr("Could not malloc memory for x\n"); exit(-1); } else if (trans == NULL) { printerr("Could not malloc memory for trans\n"); exit(-1); } else if (umain == NULL) { printerr("Could not malloc memory for umain\n"); exit(-1); } else if (umain2 == NULL) { printerr("Could not malloc memory for umain2\n"); exit(-1); }*//************************************************************************************************/ x = (double *) (((unsigned long) x) + PAGE_SIZE - ((unsigned long) x) % PAGE_SIZE); trans = (double *) (((unsigned long) trans) + PAGE_SIZE - ((unsigned long) trans) % PAGE_SIZE); umain2 = (double *) (((unsigned long) umain2) + PAGE_SIZE - ((unsigned long) umain2) % PAGE_SIZE);/************************************************************************************************//* 内存分配算法的解释及改进算法 In order to optimize data distribution, the data structures x, trans, and umain2 have been aligned so that each begins on a page boundary. This ensures that the amount of padding calculated by the program is such that each processor's partition ends on a page boundary, thus ensuring that all data from these structures that are needed by a processor can be allocated to its local memory *//* POSSIBLE ENHANCEMENT: Here is where one might distribute the x, trans, and umain2 data structures across physically distributed memories as desired. One way to place data is as follows: double *base; long i; i = ((N/P)+(rootN/P)*pad_length)*2; base = &(x[0]); for (j=0;j<P;j++) { Place all addresses x such that (base <= x < base+i) on node j base += i; } The trans and umain2 data structures can be placed in a similar manner. *//************************************************************************************************//************************************************************************************************//* 打印处理器配置信息,测试时无需打印输出 printf("\n"); printf("FFT with Blocking Transpose\n"); printf(" %ld Complex Doubles\n",N); printf(" %ld Processors\n",P); if (num_cache_lines != orig_num_lines) { printf(" %ld Cache lines\n",orig_num_lines); printf(" %ld Cache lines for blocking transpose\n",num_cache_lines); } else { printf(" %ld Cache lines\n",num_cache_lines); } printf(" %d Byte line size\n",(1 << log2_line_size)); printf(" %d Bytes per page\n",PAGE_SIZE); printf("\n");*/ /************************************************************************************************/ { pthread_barrier_init(&(Global->start), NULL, P);/*barrier初始化 */ };
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -