⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fft.c

📁 多核(64核)系统的并行FFT运算程序
💻 C
📖 第 1 页 / 共 3 页
字号:
/*#line 185 "/home/hg/splash2/codes/null_macros/c.m4.null.POSIX_BARRIER"*//*#line 1 "fft.C"*//*************************************************************************//*                                                                       *//*  Copyright (c) 1994 Stanford University                               *//*                                                                       *//*  All rights reserved.                                                 *//*                                                                       *//*  Permission is given to use, copy, and modify this software for any   *//*  non-commercial purpose as long as this copyright notice is not       *//*  removed.  All other uses, including redistribution in whole or in    *//*  part, are forbidden without prior written permission.                *//*                                                                       *//*  This software is provided with absolutely no warranty and no         *//*  support.                                                             *//*                                                                       *//*************************************************************************//*************************************************************************//*                                                                       *//*  Perform 1D fast Fourier transform using six-step FFT method          *//*                                                                       *//*  1) Performs staggered, blocked transposes for cache-line reuse       *//*  2) Roots of unity rearranged and distributed for only local          *//*     accesses during application of roots of unity                     *//*  3) Small set of roots of unity elements replicated locally for       *//*     1D FFTs (less than root N elements replicated at each node)       *//*  4) Matrix data structures are padded to reduce cache mapping         *//*     conflicts                                                         *//*                                                                       *//*  Command line options:                                                *//*                                                                       *//*  -mM : M = even integer; 2**M total complex data points transformed.  *//*  -pP : P = number of processors; Must be a power of 2.                *//*  -nN : N = number of cache lines.                                     *//*  -lL : L = Log base 2 of cache line length in bytes.                  *//*  -s  : Print individual processor timing statistics.                  *//*  -t  : Perform FFT and inverse FFT.  Test output by comparing the     *//*        integral of the original data to the integral of the data      *//*        that results from performing the FFT and inverse FFT.          *//*  -o  : Print out complex data points.                                 *//*  -h  : Print out command line options.                                *//*                                                                       *//*  Note: This version works under both the FORK and SPROC models        *//*                                                                       *//*************************************************************************//***********************************************************************************************//*全局宏及全局变量声明*//***********************************************************************************************/#include <stdio.h>#include <math.h>#include <pthread.h>#include <sys/time.h>#include <unistd.h>#include <stdlib.h>#include <malloc.h>#define PAGE_SIZE               4096        /*Cache参数需要改变,暂不复制,使用默认值*/#define NUM_CACHE_LINES        65536        /*Cache参数需要改变,暂不复制,使用默认值*/#define LOG2_LINE_SIZE             4        /*Cache参数需要改变,暂不复制,使用默认值*/#define PI                         3.1416#define DEFAULT_M                 10        /*FFT参数,暂使用默认值*/#define DEFAULT_P                 64        /*处理器数量需要改变 */#define MAX_THREADS 32#define SWAP_VALS(a,b) {double tmp; tmp=a; a=b; b=tmp;} /*互换值函数*/pthread_t PThreadTable[MAX_THREADS];        /*建立一个类型为pthread内容为线程的数组*//*定义变量为*Global的结构体*/struct GlobalMemory {  long id;  pthread_mutex_t (idlock);                 /*互斥体型 idlock*/  pthread_barrier_t (start);                /*barrier型 start*/  long *transtimes;  long *totaltimes;  unsigned long starttime;  unsigned long finishtime;  unsigned long initdonetime;} *Global; long P = DEFAULT_P ;    /*该默认值为测试方案中的数值,即为64*/long M = DEFAULT_M;long N;                 /* N = 2^M                                */long rootN;             /* rootN = N^1/2                          */double *x;              /* x is the original time-domain data     */double *trans;          /* trans is used as scratch space         */double *umain;          /* umain is roots of unity for 1D FFTs    */double *umain2;         /* umain2 is entire roots of unity matrix */long test_result = 1;   /*  可以进行结果正确性的判断,但可以不将判断结果打印,赋值为1       */long doprint = 1;       /*  不需要打印结果,但需要赋值为1                                   */long dostats = 1;       /*  需要各个处理器结果输出,赋值为1                                 */long transtime = 0;long transtime2 = 0;long avgtranstime = 0;long avgcomptime = 0;unsigned long transstart = 0;unsigned long transend = 0;long maxtotal=0;long mintotal=0;double maxfrac=0;double minfrac=0;double avgfractime=0;long orig_num_lines = NUM_CACHE_LINES;     /* number of cache lines 需要输入数据,暂使用默认值*/long num_cache_lines = NUM_CACHE_LINES;    /* number of cache lines 需要输入数据,暂使用默认值*/long log2_line_size = LOG2_LINE_SIZE;      /*                       需要输入数据,暂使用默认值*/long line_size;long rowsperproc;/*double ck1;*//*double ck3;*/                        /* checksums for testing answer */long pad_length;/***********************************************************************************************//*对整个程序需调用的函数进行声明*//***********************************************************************************************/void SlaveStart(void);double TouchArray(double *x, double *scratch, double *u, double *upriv, long MyFirst, long MyLast);double CheckSum(double *x);void InitX(double *x);void InitU(long N, double *u);void InitU2(long N, double *u, long n1);long BitReverse(long M, long k);void FFT1D(long direction, long M, long N, double *x, double *scratch, double *upriv, double *umain2,	   long MyNum, long *l_transtime, long MyFirst, long MyLast, long pad_length, long test_result, long dostats);void TwiddleOneCol(long direction, long n1, long j, double *u, double *x, long pad_length);void Scale(long n1, long N, double *x);void Transpose(long n1, double *src, double *dest, long MyNum, long MyFirst, long MyLast, long pad_length);void CopyColumn(long n1, double *src, double *dest);void Reverse(long N, long M, double *x);void FFT1DOnce(long direction, long M, long N, double *u, double *x);/*void PrintArray(long N, double *x);   不需要打印FFT结果,注释掉*/void printerr(char *s);long log_2(long number);void srand48(long int seedval);double drand48(void);/***********************************************************************************************//*主函数区域*//***********************************************************************************************/int main(int argc, char *argv[]){  long i;   long c;  /*extern char *optarg;*/  /*直接赋值,无需从外部输入数据*/  long m1;  long factor;  long pages;  unsigned long start;  {	struct timeval	FullTime;	gettimeofday(&FullTime, NULL);	(start) = (unsigned long)(FullTime.tv_usec + FullTime.tv_sec * 1000000);};/***********************************************************************************************//*   判断输入数据是否合法,并初始化整个函数,测试时无需进行判断    while ((c = getopt(argc, argv, "p:m:n:l:stoh")) != -1) {          switch(c) {      case 'p': P = atoi(optarg);                 if (P < 1) {                  printerr("P must be >= 1\n");                  exit(-1);                }                if (log_2(P) == -1) {                  printerr("P must be a power of 2\n");                  exit(-1);                }	        break;        case 'm': M = atoi(optarg);                 m1 = M/2;                if (2*m1 != M) {                  printerr("M must be even\n");                  exit(-1);                }	        break;        case 'n': num_cache_lines = atoi(optarg);                 orig_num_lines = num_cache_lines;                if (num_cache_lines < 1) {                  printerr("Number of cache lines must be >= 1\n");                  exit(-1);                }	        break;        case 'l': log2_line_size = atoi(optarg);                 if (log2_line_size < 0) {                  printerr("Log base 2 of cache line length in bytes must be >= 0\n");                  exit(-1);                }	        break;        case 's': dostats = !dostats; 	        break;      case 't': test_result = !test_result; 	        break;      case 'o': doprint = !doprint; 	        break;      case 'h': printf("Usage: FFT <options>\n\n");                printf("options:\n");                printf("  -mM : M = even integer; 2**M total complex data points transformed.\n");                printf("  -pP : P = number of processors; Must be a power of 2.\n");                printf("  -nN : N = number of cache lines.\n");                printf("  -lL : L = Log base 2 of cache line length in bytes.\n");                printf("  -s  : Print individual processor timing statistics.\n");                printf("  -t  : Perform FFT and inverse FFT.  Test output by comparing the\n");                printf("        integral of the original data to the integral of the data that\n");                printf("        results from performing the FFT and inverse FFT.\n");                printf("  -o  : Print out complex data points.\n");                printf("  -h  : Print out command line options.\n\n");                printf("Default: FFT -m%1d -p%1d -n%1d -l%1d\n",                       DEFAULT_M,DEFAULT_P,NUM_CACHE_LINES,LOG2_LINE_SIZE);		exit(0);	        break;    }  }*//***********************************************************************************************/  {;};/*Padding algorithm */  N = 1<<M; /*M = even integer; 2**M total complex data points transformed.*/  rootN = 1<<(M/2);  rowsperproc = rootN/P;/************************************************************************************************//*  无需对rowperproc值进行判断    if (rowsperproc == 0) {                                                                       printerr("Matrix not large enough. 2**(M/2) must be >= P\n");                                exit(-1);                                                                                     }*/	/************************************************************************************************/  line_size = 1 << log2_line_size;  if (line_size < 2*sizeof(double)) {    printf("WARNING: Each element is a complex double (%ld bytes)\n",2*sizeof(double));    printf("  => Less than one element per cache line\n");    printf("     Computing transpose blocking factor\n");    factor = (2*sizeof(double)) / line_size;    num_cache_lines = orig_num_lines / factor;  }    if (line_size <= 2*sizeof(double)) {    pad_length = 1;  } else {    pad_length = line_size / (2*sizeof(double));  }  if (rowsperproc * rootN * 2 * sizeof(double) >= PAGE_SIZE) {    pages = (2 * pad_length * sizeof(double) * rowsperproc) / PAGE_SIZE;    if (pages * PAGE_SIZE != 2 * pad_length * sizeof(double) * rowsperproc) {      pages ++;    }    pad_length = (pages * PAGE_SIZE) / (2 * sizeof(double) * rowsperproc);  } else {    pad_length = (PAGE_SIZE - (rowsperproc * rootN * 2 * sizeof(double))) /                 (2 * sizeof(double) * rowsperproc);    if (pad_length * (2 * sizeof(double) * rowsperproc) !=        (PAGE_SIZE - (rowsperproc * rootN * 2 * sizeof(double)))) {      printerr("Padding algorithm unsuccessful\n");      exit(-1);    }  }    /*使用Valloc函数为变量分配内存,地址为memory每页的边界*/  Global = (struct GlobalMemory *) valloc(sizeof(struct GlobalMemory));;  x = (double *) valloc(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE);;  trans = (double *) valloc(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE);;  umain = (double *) valloc(2*rootN*sizeof(double));;    umain2 = (double *) valloc(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE);;  Global->transtimes = (long *) valloc(P*sizeof(long));;    Global->totaltimes = (long *) valloc(P*sizeof(long));;   /************************************************************************************************/ /*  无需进行判断    if (Global == NULL) {    printerr("Could not malloc memory for Global\n");    exit(-1);  } else if (x == NULL) {    printerr("Could not malloc memory for x\n");    exit(-1);  } else if (trans == NULL) {    printerr("Could not malloc memory for trans\n");    exit(-1);  } else if (umain == NULL) {    printerr("Could not malloc memory for umain\n");    exit(-1);  } else if (umain2 == NULL) {    printerr("Could not malloc memory for umain2\n");    exit(-1);  }*//************************************************************************************************/  x = (double *) (((unsigned long) x) + PAGE_SIZE - ((unsigned long) x) % PAGE_SIZE);  trans = (double *) (((unsigned long) trans) + PAGE_SIZE - ((unsigned long) trans) % PAGE_SIZE);  umain2 = (double *) (((unsigned long) umain2) + PAGE_SIZE - ((unsigned long) umain2) % PAGE_SIZE);/************************************************************************************************//* 内存分配算法的解释及改进算法   In order to optimize data distribution, the data structures x, trans,    and umain2 have been aligned so that each begins on a page boundary.    This ensures that the amount of padding calculated by the program is    such that each processor's partition ends on a page boundary, thus    ensuring that all data from these structures that are needed by a    processor can be allocated to its local memory *//* POSSIBLE ENHANCEMENT:  Here is where one might distribute the x,   trans, and umain2 data structures across physically distributed    memories as desired.      One way to place data is as follows:   double *base;   long i;   i = ((N/P)+(rootN/P)*pad_length)*2;   base = &(x[0]);   for (j=0;j<P;j++) {    Place all addresses x such that (base <= x < base+i) on node j    base += i;   }   The trans and umain2 data structures can be placed in a similar manner.   *//************************************************************************************************//************************************************************************************************//*  打印处理器配置信息,测试时无需打印输出  printf("\n");  printf("FFT with Blocking Transpose\n");  printf("   %ld Complex Doubles\n",N);  printf("   %ld Processors\n",P);    if (num_cache_lines != orig_num_lines) {    printf("   %ld Cache lines\n",orig_num_lines);    printf("   %ld Cache lines for blocking transpose\n",num_cache_lines);  } else {    printf("   %ld Cache lines\n",num_cache_lines);  }  printf("   %d Byte line size\n",(1 << log2_line_size));  printf("   %d Bytes per page\n",PAGE_SIZE);  printf("\n");*/ /************************************************************************************************/    {	pthread_barrier_init(&(Global->start), NULL, P);/*barrier初始化 */  };

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -