📄 impulse_fft_hw.c

📁 ImpulseC Codeveloper fft code. This file implements the hardware portion of a 256 sample FFT using a
💻 C
字号:
//////////////////////////////////////////////////////
//
// fft_hw.c
//
// Green Mountain Computing Systems, Inc.
// You may copy, use and modified this example.
//
// This file implements the hardware portion of a 256 sample FFT using a radix-4 algorithm.
//
// This implementation demonstrates that results similar to hand-coded HDL can be achieved
// using the C language, and without using a low-level style of C code.
//  
// This implementation takes advantage of hardware pipelining to compute one iteration per
// cycle without the need to specify how to divide up the computation and data over multiple
// cycles.  The implementation does require some insights into the hardware, such as the
// mapping of the sample data into four separate banks so that four samples per cycle can be
// accessed.  This type of manual optimization is not uncommon in parallel programming.
// 
//
// Performance
//   273 Cycles (1 butterfly/cycle + 14 cycle pipeline latency)
//
// Resource usage:
//
// There are three main optimizations used to accelerate the FFT algorithm:
//   1. The complex number arithmetic is automatically computed in parallel
//   2. The pipeline option is turned on to automatically parallelize loop iterations
//   3. The twiddle factors and samples are stored in multiple memory banks to allow
//      multiple loads and stores in a single cycle 
//
	
#ifdef WIN32
#include <windows.h>
#endif
#include <stdio.h>
#include "co.h"
#include "co_math.h"

void test_producer(co_stream datastream);
void test_consumer(co_stream datastream);

#define NUM_PTS 256

// define a complex number type
typedef struct cpx_s {
	float r,i;
} cpx_t;

//
// getBank(i) returns the bank number where the sample i is stored.
//
// The sample data is stored in four separate arrays in order to access four samples/cycle.
// The sample data must be carefully laid out in the arrays so that the four values needed
// in each iteration are always in different arrays.  The getBank macro maps the element
// number to an array/bank number (0-3) in a way that ensures this.
//
#define getBank(i) ((i^(i>>2)^(i>>4)^(i>>6))&0x3)

//
// select is used to assign res the correct value v0,..,v3 for the selected bank
//
#define select(res,mem,bank0,v0,bank1,v1,bank2,v2,bank3,v3)\
			if (bank0==mem)\
				res=v0;\
			else if (bank1==mem)\
				res=v1;\
			else if (bank2==mem)\
				res=v2;\
			else\
				res=v3

//
// fft256proc is the FFT implementation
//
// This procedure reads 256 complex number samples from the input stream, performs the
// fft computation and writes the 256 result values.
//

void fft256proc(co_stream sample_stream, co_stream result_stream)
{
	// Each array is implemented in its own memory bank.  By splitting the data into multiple
	// arrays, we are able to access many values in one cycle.
float twiddle1r[64]={1,0.999699,0.998795,0.99729,0.995185,0.99248,0.989177,0.985278,0.980785,0.975702,0.970031,0.963776,0.95694,0.949528,0.941544,0.932993,0.92388,0.91421,0.903989,0.893224,0.881921,0.870087,0.857729,0.844854,0.83147,0.817585,0.803208,0.788346,0.77301,0.757209,0.740951,0.724247,0.707107,0.689541,0.671559,0.653173,0.634393,0.615232,0.595699,0.575808,0.55557,0.534998,0.514103,0.492898,0.471397,0.449611,0.427555,0.405241,0.382683,0.359895,0.33689,0.313682,0.290285,0.266713,0.24298,0.219101,0.19509,0.170962,0.14673,0.122411,0.0980171,0.0735646,0.0490677,0.0245412};
float twiddle1i[64]={0,-0.0245412,-0.0490677,-0.0735646,-0.0980171,-0.122411,-0.14673,-0.170962,-0.19509,-0.219101,-0.24298,-0.266713,-0.290285,-0.313682,-0.33689,-0.359895,-0.382683,-0.405241,-0.427555,-0.449611,-0.471397,-0.492898,-0.514103,-0.534998,-0.55557,-0.575808,-0.595699,-0.615232,-0.634393,-0.653173,-0.671559,-0.689541,-0.707107,-0.724247,-0.740951,-0.757209,-0.77301,-0.788346,-0.803208,-0.817585,-0.83147,-0.844854,-0.857729,-0.870087,-0.881921,-0.893224,-0.903989,-0.91421,-0.92388,-0.932993,-0.941544,-0.949528,-0.95694,-0.963776,-0.970031,-0.975702,-0.980785,-0.985278,-0.989177,-0.99248,-0.995185,-0.99729,-0.998795,-0.999699};
float twiddle2r[64]={1,0.998795,0.995185,0.989177,0.980785,0.970031,0.95694,0.941544,0.92388,0.903989,0.881921,0.857729,0.83147,0.803208,0.77301,0.740951,0.707107,0.671559,0.634393,0.595699,0.55557,0.514103,0.471397,0.427555,0.382683,0.33689,0.290285,0.24298,0.19509,0.14673,0.0980171,0.0490677,6.12303e-017,-0.0490677,-0.0980171,-0.14673,-0.19509,-0.24298,-0.290285,-0.33689,-0.382683,-0.427555,-0.471397,-0.514103,-0.55557,-0.595699,-0.634393,-0.671559,-0.707107,-0.740951,-0.77301,-0.803208,-0.83147,-0.857729,-0.881921,-0.903989,-0.92388,-0.941544,-0.95694,-0.970031,-0.980785,-0.989177,-0.995185,-0.998795};
float twiddle2i[64]={0,-0.0490677,-0.0980171,-0.14673,-0.19509,-0.24298,-0.290285,-0.33689,-0.382683,-0.427555,-0.471397,-0.514103,-0.55557,-0.595699,-0.634393,-0.671559,-0.707107,-0.740951,-0.77301,-0.803208,-0.83147,-0.857729,-0.881921,-0.903989,-0.92388,-0.941544,-0.95694,-0.970031,-0.980785,-0.989177,-0.995185,-0.998795,-1,-0.998795,-0.995185,-0.989177,-0.980785,-0.970031,-0.95694,-0.941544,-0.92388,-0.903989,-0.881921,-0.857729,-0.83147,-0.803208,-0.77301,-0.740951,-0.707107,-0.671559,-0.634393,-0.595699,-0.55557,-0.514103,-0.471397,-0.427555,-0.382683,-0.33689,-0.290285,-0.24298,-0.19509,-0.14673,-0.0980171,-0.0490677};
float twiddle3r[64]={1,0.99729,0.989177,0.975702,0.95694,0.932993,0.903989,0.870087,0.83147,0.788346,0.740951,0.689541,0.634393,0.575808,0.514103,0.449611,0.382683,0.313682,0.24298,0.170962,0.0980171,0.0245412,-0.0490677,-0.122411,-0.19509,-0.266713,-0.33689,-0.405241,-0.471397,-0.534998,-0.595699,-0.653173,-0.707107,-0.757209,-0.803208,-0.844854,-0.881921,-0.91421,-0.941544,-0.963776,-0.980785,-0.99248,-0.998795,-0.999699,-0.995185,-0.985278,-0.970031,-0.949528,-0.92388,-0.893224,-0.857729,-0.817585,-0.77301,-0.724247,-0.671559,-0.615232,-0.55557,-0.492898,-0.427555,-0.359895,-0.290285,-0.219101,-0.14673,-0.0735646};
float twiddle3i[64]={0,-0.0735646,-0.14673,-0.219101,-0.290285,-0.359895,-0.427555,-0.492898,-0.55557,-0.615232,-0.671559,-0.724247,-0.77301,-0.817585,-0.857729,-0.893224,-0.92388,-0.949528,-0.970031,-0.985278,-0.995185,-0.999699,-0.998795,-0.99248,-0.980785,-0.963776,-0.941544,-0.91421,-0.881921,-0.844854,-0.803208,-0.757209,-0.707107,-0.653173,-0.595699,-0.534998,-0.471397,-0.405241,-0.33689,-0.266713,-0.19509,-0.122411,-0.0490677,0.0245412,0.0980171,0.170962,0.24298,0.313682,0.382683,0.449611,0.514103,0.575808,0.634393,0.689541,0.740951,0.788346,0.83147,0.870087,0.903989,0.932993,0.95694,0.975702,0.989177,0.99729};

	float data0r[NUM_PTS/4];
	float data1r[NUM_PTS/4];
	float data2r[NUM_PTS/4];
	float data3r[NUM_PTS/4];
	float data0i[NUM_PTS/4];
	float data1i[NUM_PTS/4];
	float data2i[NUM_PTS/4];
	float data3i[NUM_PTS/4];
	cpx_t mem0,mem1,mem2,mem3;

	co_uint8 i,m,j,j0,j1,j2,j3,inc,t,t0,tinc;
	co_uint16 axis;

	float sr,si;
	cpx_t p1,p2,p3;
	cpx_t c0,c1,c2,c3;
	cpx_t d0,d1,d2,d3;

	co_uint2 bank0,bank1,bank2,bank3;
	co_uint8 loc0,loc1,loc2,loc3;

#pragma CO set defaultDelay 64

	do {
	/* reorder data as it comes in for inplace fft computation */
	co_stream_open(result_stream, O_WRONLY, INT_TYPE(32));
	co_stream_open(sample_stream, O_RDONLY, INT_TYPE(32));
	i=0;
	do {
#pragma CO PIPELINE
		co_stream_read(sample_stream,&sr,sizeof(float));
		co_stream_read(sample_stream,&si,sizeof(float));
		// this does the reordering
		j0=((i>>6)|((i>>2)&0x0c)|((i<<2)&0x30)|((i<<6)&0xc0));
		// store it in the assigned bank
		bank0=getBank(j0);
		if (bank0==0) {
			data0r[j0>>2]=sr;
			data0i[j0>>2]=si;
		}
		if (bank0==1) {
			data1r[j0>>2]=sr;
			data1i[j0>>2]=si;
		}
		if (bank0==2) {
			data2r[j0>>2]=sr;
			data2i[j0>>2]=si;
		}
		if (bank0==3) {
			data3r[j0>>2]=sr;
			data3i[j0>>2]=si;
		}
		i++;
	} while (i!=0);

	//
	// compute 4 stages of 64 butterflies in one 4x64 loop
	//
	// By flattening the FFT algorithm into a single loop, we are able to use the pipeline
	// option enabling us to compute 1 butterfly/cycle
	//

	// these parameters are used to compute the appropriate indexing for the current stage
	m=1;
	j=0;
	axis=0x00ff;
	inc=4;
	t=0;
	tinc=0x40;
	i=0;
	do {
#pragma CO PIPELINE
// The nonrecursive pragma forces the optimizer to ignore "recursive"
// variable usage for these variables. See the Impulse C documentation.
// In this example, all access to values in data0r .. data3i are at least 16 iterations old
// so as long as the latency does not exceed 16, we can declare them nonrecursive, although
// technically they are not.
#pragma CO nonrecursive data0r
#pragma CO nonrecursive data1r
#pragma CO nonrecursive data2r
#pragma CO nonrecursive data3r
#pragma CO nonrecursive data0i
#pragma CO nonrecursive data1i
#pragma CO nonrecursive data2i
#pragma CO nonrecursive data3i
		// compute the addresses of the four samples required for this butterfly
		j0=j;
		j1=j+m;
		j2=j+(m<<1);
		j3=j2+m;
		t0=t;

		// prep next butterfly
		t=(t+tinc)&0x3f;
		if ((i&(axis>>8))==(axis>>8))
			j+=inc;
		else
			j++;
		i++;
		if ((i&0x3f)==0) {
			// compute parameters for the next stage
			tinc=tinc>>2;
			m=m<<2;
			axis=axis<<2;
			inc=(m<<2)-(axis>>8);
		}

		//
		// retrieve the four samples from memory
		//

		bank0=getBank(j0);
		bank1=getBank(j1);
		bank2=getBank(j2);
		bank3=getBank(j3);

		// select the address for each array according to the bank assignments
		select(loc0,0,bank0,j0,bank1,j1,bank2,j2,bank3,j3);
		select(loc1,1,bank0,j0,bank1,j1,bank2,j2,bank3,j3);
		select(loc2,2,bank0,j0,bank1,j1,bank2,j2,bank3,j3);
		select(loc3,3,bank0,j0,bank1,j1,bank2,j2,bank3,j3);

		// get the data
		mem0.r=data0r[loc0>>2]; mem0.i=data0i[loc0>>2];
		mem1.r=data1r[loc1>>2]; mem1.i=data1i[loc1>>2];
		mem2.r=data2r[loc2>>2]; mem2.i=data2i[loc2>>2];
		mem3.r=data3r[loc3>>2]; mem3.i=data3i[loc3>>2];

		// select the sample for each d0,..,d3 according to the bank assignments
		select(d0,0,bank0,mem0,bank1,mem1,bank2,mem2,bank3,mem3);
		select(d1,1,bank0,mem0,bank1,mem1,bank2,mem2,bank3,mem3);
		select(d2,2,bank0,mem0,bank1,mem1,bank2,mem2,bank3,mem3);
		select(d3,3,bank0,mem0,bank1,mem1,bank2,mem2,bank3,mem3);

		//
		// radix-4 butterfly
		//
		c1.r=twiddle1r[t0]; c1.i=twiddle1i[t0];
		c2.r=twiddle2r[t0]; c2.i=twiddle2i[t0];
		c3.r=twiddle3r[t0]; c3.i=twiddle3i[t0];

		p1.r=c1.r*d1.r-c1.i*d1.i;
		p1.i=c1.r*d1.i+c1.i*d1.r;
		p2.r=c2.r*d2.r-c2.i*d2.i;
		p2.i=c2.r*d2.i+c2.i*d2.r;
		p3.r=c3.r*d3.r-c3.i*d3.i;
		p3.i=c3.r*d3.i+c3.i*d3.r;

		c0.r=d0.r+p2.r; // c0=d0+twiddle2*d2
		c0.i=d0.i+p2.i;
		c1.r=p1.r+p3.r; // c1=twiddle1*d1+twiddle3*d3
		c1.i=p1.i+p3.i;
		c2.r=d0.r-p2.r; // c2=d0-twiddle2*d2
		c2.i=d0.i-p2.i;
		c3.r=p3.i-p1.i; // c3=j*(tiddle1*d1-twiddle3*d3)
		c3.i=p1.r-p3.r;
		
		d0.r=c0.r+c1.r; // d0=d0+twiddle2*d2+twiddle1*d1+twiddle3*d3
		d0.i=c0.i+c1.i;
		d1.r=c2.r-c3.r; // d1=d0-twiddle2*d2-j*(tiddle1*d1-twiddle3*d3)
		d1.i=c2.i-c3.i;
		d2.r=c0.r-c1.r; // d2=d0+twiddle2*d2-twiddle1*d1-twiddle3*d3
		d2.i=c0.i-c1.i;
		d3.r=c2.r+c3.r; // d3=d0-twiddle2*d2+j*(tiddle1*d1-twiddle3*d3)
		d3.i=c2.i+c3.i;

		// select the output data for each memory according to the bank assignments
		select(mem0,0,bank0,d0,bank1,d1,bank2,d2,bank3,d3);
		select(mem1,1,bank0,d0,bank1,d1,bank2,d2,bank3,d3);
		select(mem2,2,bank0,d0,bank1,d1,bank2,d2,bank3,d3);
		select(mem3,3,bank0,d0,bank1,d1,bank2,d2,bank3,d3);

		// store the data
		data0r[loc0>>2]=mem0.r;
		data0i[loc0>>2]=mem0.i;
		data1r[loc1>>2]=mem1.r;
		data1i[loc1>>2]=mem1.i;
		data2r[loc2>>2]=mem2.r;
		data2i[loc2>>2]=mem2.i;
		data3r[loc3>>2]=mem3.r;
		data3i[loc3>>2]=mem3.i;

	} while (i!=0);

	// stream out the data
	j0=0;
	do {
#pragma CO PIPELINE
		bank0=getBank(j0);
		if (bank0==0) {
			sr=data0r[j0>>2];
			si=data0i[j0>>2];
		}
		if (bank0==1) {
			sr=data1r[j0>>2];
			si=data1i[j0>>2];
		}
		if (bank0==2) {
			sr=data2r[j0>>2];
			si=data2i[j0>>2];
		}
		if (bank0==3) {
			sr=data3r[j0>>2];
			si=data3i[j0>>2];
		}
		co_stream_write(result_stream,&sr,sizeof(float));
		co_stream_write(result_stream,&si,sizeof(float));
        j0++;
	} while (j0!=0);

    co_stream_close(sample_stream);
    co_stream_close(result_stream);
	IF_SIM(break;)

	} while (1);
}

void config_fft(void *arg)
{
	co_stream sample_stream;
	co_stream result_stream;

	co_process producer_process;
	co_process fft_process;
	co_process consumer_process;

	sample_stream = co_stream_create( "sample_stream", INT_TYPE(32), 1 ); 
	result_stream  = co_stream_create( "result_stream", INT_TYPE(32), 1 );

	producer_process = co_process_create("producer_process", (co_function)test_producer,
                                         1,sample_stream);

	fft_process = co_process_create("fft_process", (co_function)fft256proc,
                                    2,sample_stream,result_stream);

	consumer_process = co_process_create("consumer_process",(co_function)test_consumer,
                                         1,result_stream);

	co_process_config(fft_process, co_loc, "PE0");  
}

co_architecture co_initialize(void *arg)
{
	return(co_architecture_create("fft256","VHDL",config_fft,(void *)arg));
}
💿 文件大小 5 K
👤 上传用户 c_word
📂 所属分类软件设计/软件工程
📄 代码行数 335 行
💻 语言类型 C语言
🏷️ 相关标签

#Codeveloper #implements #ImpulseC #hardware
更多Codeveloper资源 →
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -