📄 impulse_fft_hw.c
字号:
//////////////////////////////////////////////////////
//
// fft_hw.c
//
// Green Mountain Computing Systems, Inc.
// You may copy, use and modified this example.
//
// This file implements the hardware portion of a 256 sample FFT using a radix-4 algorithm.
//
// This implementation demonstrates that results similar to hand-coded HDL can be achieved
// using the C language, and without using a low-level style of C code.
//
// This implementation takes advantage of hardware pipelining to compute one iteration per
// cycle without the need to specify how to divide up the computation and data over multiple
// cycles. The implementation does require some insights into the hardware, such as the
// mapping of the sample data into four separate banks so that four samples per cycle can be
// accessed. This type of manual optimization is not uncommon in parallel programming.
//
//
// Performance
// 273 Cycles (1 butterfly/cycle + 14 cycle pipeline latency)
//
// Resource usage:
//
// There are three main optimizations used to accelerate the FFT algorithm:
// 1. The complex number arithmetic is automatically computed in parallel
// 2. The pipeline option is turned on to automatically parallelize loop iterations
// 3. The twiddle factors and samples are stored in multiple memory banks to allow
// multiple loads and stores in a single cycle
//
#ifdef WIN32
#include <windows.h>
#endif
#include <stdio.h>
#include "co.h"
#include "co_math.h"
void test_producer(co_stream datastream);
void test_consumer(co_stream datastream);
#define NUM_PTS 256
// define a complex number type
typedef struct cpx_s {
float r,i;
} cpx_t;
//
// getBank(i) returns the bank number where the sample i is stored.
//
// The sample data is stored in four separate arrays in order to access four samples/cycle.
// The sample data must be carefully laid out in the arrays so that the four values needed
// in each iteration are always in different arrays. The getBank macro maps the element
// number to an array/bank number (0-3) in a way that ensures this.
//
#define getBank(i) ((i^(i>>2)^(i>>4)^(i>>6))&0x3)
//
// select is used to assign res the correct value v0,..,v3 for the selected bank
//
#define select(res,mem,bank0,v0,bank1,v1,bank2,v2,bank3,v3)\
if (bank0==mem)\
res=v0;\
else if (bank1==mem)\
res=v1;\
else if (bank2==mem)\
res=v2;\
else\
res=v3
//
// fft256proc is the FFT implementation
//
// This procedure reads 256 complex number samples from the input stream, performs the
// fft computation and writes the 256 result values.
//
void fft256proc(co_stream sample_stream, co_stream result_stream)
{
// Each array is implemented in its own memory bank. By splitting the data into multiple
// arrays, we are able to access many values in one cycle.
float twiddle1r[64]={1,0.999699,0.998795,0.99729,0.995185,0.99248,0.989177,0.985278,0.980785,0.975702,0.970031,0.963776,0.95694,0.949528,0.941544,0.932993,0.92388,0.91421,0.903989,0.893224,0.881921,0.870087,0.857729,0.844854,0.83147,0.817585,0.803208,0.788346,0.77301,0.757209,0.740951,0.724247,0.707107,0.689541,0.671559,0.653173,0.634393,0.615232,0.595699,0.575808,0.55557,0.534998,0.514103,0.492898,0.471397,0.449611,0.427555,0.405241,0.382683,0.359895,0.33689,0.313682,0.290285,0.266713,0.24298,0.219101,0.19509,0.170962,0.14673,0.122411,0.0980171,0.0735646,0.0490677,0.0245412};
float twiddle1i[64]={0,-0.0245412,-0.0490677,-0.0735646,-0.0980171,-0.122411,-0.14673,-0.170962,-0.19509,-0.219101,-0.24298,-0.266713,-0.290285,-0.313682,-0.33689,-0.359895,-0.382683,-0.405241,-0.427555,-0.449611,-0.471397,-0.492898,-0.514103,-0.534998,-0.55557,-0.575808,-0.595699,-0.615232,-0.634393,-0.653173,-0.671559,-0.689541,-0.707107,-0.724247,-0.740951,-0.757209,-0.77301,-0.788346,-0.803208,-0.817585,-0.83147,-0.844854,-0.857729,-0.870087,-0.881921,-0.893224,-0.903989,-0.91421,-0.92388,-0.932993,-0.941544,-0.949528,-0.95694,-0.963776,-0.970031,-0.975702,-0.980785,-0.985278,-0.989177,-0.99248,-0.995185,-0.99729,-0.998795,-0.999699};
float twiddle2r[64]={1,0.998795,0.995185,0.989177,0.980785,0.970031,0.95694,0.941544,0.92388,0.903989,0.881921,0.857729,0.83147,0.803208,0.77301,0.740951,0.707107,0.671559,0.634393,0.595699,0.55557,0.514103,0.471397,0.427555,0.382683,0.33689,0.290285,0.24298,0.19509,0.14673,0.0980171,0.0490677,6.12303e-017,-0.0490677,-0.0980171,-0.14673,-0.19509,-0.24298,-0.290285,-0.33689,-0.382683,-0.427555,-0.471397,-0.514103,-0.55557,-0.595699,-0.634393,-0.671559,-0.707107,-0.740951,-0.77301,-0.803208,-0.83147,-0.857729,-0.881921,-0.903989,-0.92388,-0.941544,-0.95694,-0.970031,-0.980785,-0.989177,-0.995185,-0.998795};
float twiddle2i[64]={0,-0.0490677,-0.0980171,-0.14673,-0.19509,-0.24298,-0.290285,-0.33689,-0.382683,-0.427555,-0.471397,-0.514103,-0.55557,-0.595699,-0.634393,-0.671559,-0.707107,-0.740951,-0.77301,-0.803208,-0.83147,-0.857729,-0.881921,-0.903989,-0.92388,-0.941544,-0.95694,-0.970031,-0.980785,-0.989177,-0.995185,-0.998795,-1,-0.998795,-0.995185,-0.989177,-0.980785,-0.970031,-0.95694,-0.941544,-0.92388,-0.903989,-0.881921,-0.857729,-0.83147,-0.803208,-0.77301,-0.740951,-0.707107,-0.671559,-0.634393,-0.595699,-0.55557,-0.514103,-0.471397,-0.427555,-0.382683,-0.33689,-0.290285,-0.24298,-0.19509,-0.14673,-0.0980171,-0.0490677};
float twiddle3r[64]={1,0.99729,0.989177,0.975702,0.95694,0.932993,0.903989,0.870087,0.83147,0.788346,0.740951,0.689541,0.634393,0.575808,0.514103,0.449611,0.382683,0.313682,0.24298,0.170962,0.0980171,0.0245412,-0.0490677,-0.122411,-0.19509,-0.266713,-0.33689,-0.405241,-0.471397,-0.534998,-0.595699,-0.653173,-0.707107,-0.757209,-0.803208,-0.844854,-0.881921,-0.91421,-0.941544,-0.963776,-0.980785,-0.99248,-0.998795,-0.999699,-0.995185,-0.985278,-0.970031,-0.949528,-0.92388,-0.893224,-0.857729,-0.817585,-0.77301,-0.724247,-0.671559,-0.615232,-0.55557,-0.492898,-0.427555,-0.359895,-0.290285,-0.219101,-0.14673,-0.0735646};
float twiddle3i[64]={0,-0.0735646,-0.14673,-0.219101,-0.290285,-0.359895,-0.427555,-0.492898,-0.55557,-0.615232,-0.671559,-0.724247,-0.77301,-0.817585,-0.857729,-0.893224,-0.92388,-0.949528,-0.970031,-0.985278,-0.995185,-0.999699,-0.998795,-0.99248,-0.980785,-0.963776,-0.941544,-0.91421,-0.881921,-0.844854,-0.803208,-0.757209,-0.707107,-0.653173,-0.595699,-0.534998,-0.471397,-0.405241,-0.33689,-0.266713,-0.19509,-0.122411,-0.0490677,0.0245412,0.0980171,0.170962,0.24298,0.313682,0.382683,0.449611,0.514103,0.575808,0.634393,0.689541,0.740951,0.788346,0.83147,0.870087,0.903989,0.932993,0.95694,0.975702,0.989177,0.99729};
float data0r[NUM_PTS/4];
float data1r[NUM_PTS/4];
float data2r[NUM_PTS/4];
float data3r[NUM_PTS/4];
float data0i[NUM_PTS/4];
float data1i[NUM_PTS/4];
float data2i[NUM_PTS/4];
float data3i[NUM_PTS/4];
cpx_t mem0,mem1,mem2,mem3;
co_uint8 i,m,j,j0,j1,j2,j3,inc,t,t0,tinc;
co_uint16 axis;
float sr,si;
cpx_t p1,p2,p3;
cpx_t c0,c1,c2,c3;
cpx_t d0,d1,d2,d3;
co_uint2 bank0,bank1,bank2,bank3;
co_uint8 loc0,loc1,loc2,loc3;
#pragma CO set defaultDelay 64
do {
/* reorder data as it comes in for inplace fft computation */
co_stream_open(result_stream, O_WRONLY, INT_TYPE(32));
co_stream_open(sample_stream, O_RDONLY, INT_TYPE(32));
i=0;
do {
#pragma CO PIPELINE
co_stream_read(sample_stream,&sr,sizeof(float));
co_stream_read(sample_stream,&si,sizeof(float));
// this does the reordering
j0=((i>>6)|((i>>2)&0x0c)|((i<<2)&0x30)|((i<<6)&0xc0));
// store it in the assigned bank
bank0=getBank(j0);
if (bank0==0) {
data0r[j0>>2]=sr;
data0i[j0>>2]=si;
}
if (bank0==1) {
data1r[j0>>2]=sr;
data1i[j0>>2]=si;
}
if (bank0==2) {
data2r[j0>>2]=sr;
data2i[j0>>2]=si;
}
if (bank0==3) {
data3r[j0>>2]=sr;
data3i[j0>>2]=si;
}
i++;
} while (i!=0);
//
// compute 4 stages of 64 butterflies in one 4x64 loop
//
// By flattening the FFT algorithm into a single loop, we are able to use the pipeline
// option enabling us to compute 1 butterfly/cycle
//
// these parameters are used to compute the appropriate indexing for the current stage
m=1;
j=0;
axis=0x00ff;
inc=4;
t=0;
tinc=0x40;
i=0;
do {
#pragma CO PIPELINE
// The nonrecursive pragma forces the optimizer to ignore "recursive"
// variable usage for these variables. See the Impulse C documentation.
// In this example, all access to values in data0r .. data3i are at least 16 iterations old
// so as long as the latency does not exceed 16, we can declare them nonrecursive, although
// technically they are not.
#pragma CO nonrecursive data0r
#pragma CO nonrecursive data1r
#pragma CO nonrecursive data2r
#pragma CO nonrecursive data3r
#pragma CO nonrecursive data0i
#pragma CO nonrecursive data1i
#pragma CO nonrecursive data2i
#pragma CO nonrecursive data3i
// compute the addresses of the four samples required for this butterfly
j0=j;
j1=j+m;
j2=j+(m<<1);
j3=j2+m;
t0=t;
// prep next butterfly
t=(t+tinc)&0x3f;
if ((i&(axis>>8))==(axis>>8))
j+=inc;
else
j++;
i++;
if ((i&0x3f)==0) {
// compute parameters for the next stage
tinc=tinc>>2;
m=m<<2;
axis=axis<<2;
inc=(m<<2)-(axis>>8);
}
//
// retrieve the four samples from memory
//
bank0=getBank(j0);
bank1=getBank(j1);
bank2=getBank(j2);
bank3=getBank(j3);
// select the address for each array according to the bank assignments
select(loc0,0,bank0,j0,bank1,j1,bank2,j2,bank3,j3);
select(loc1,1,bank0,j0,bank1,j1,bank2,j2,bank3,j3);
select(loc2,2,bank0,j0,bank1,j1,bank2,j2,bank3,j3);
select(loc3,3,bank0,j0,bank1,j1,bank2,j2,bank3,j3);
// get the data
mem0.r=data0r[loc0>>2]; mem0.i=data0i[loc0>>2];
mem1.r=data1r[loc1>>2]; mem1.i=data1i[loc1>>2];
mem2.r=data2r[loc2>>2]; mem2.i=data2i[loc2>>2];
mem3.r=data3r[loc3>>2]; mem3.i=data3i[loc3>>2];
// select the sample for each d0,..,d3 according to the bank assignments
select(d0,0,bank0,mem0,bank1,mem1,bank2,mem2,bank3,mem3);
select(d1,1,bank0,mem0,bank1,mem1,bank2,mem2,bank3,mem3);
select(d2,2,bank0,mem0,bank1,mem1,bank2,mem2,bank3,mem3);
select(d3,3,bank0,mem0,bank1,mem1,bank2,mem2,bank3,mem3);
//
// radix-4 butterfly
//
c1.r=twiddle1r[t0]; c1.i=twiddle1i[t0];
c2.r=twiddle2r[t0]; c2.i=twiddle2i[t0];
c3.r=twiddle3r[t0]; c3.i=twiddle3i[t0];
p1.r=c1.r*d1.r-c1.i*d1.i;
p1.i=c1.r*d1.i+c1.i*d1.r;
p2.r=c2.r*d2.r-c2.i*d2.i;
p2.i=c2.r*d2.i+c2.i*d2.r;
p3.r=c3.r*d3.r-c3.i*d3.i;
p3.i=c3.r*d3.i+c3.i*d3.r;
c0.r=d0.r+p2.r; // c0=d0+twiddle2*d2
c0.i=d0.i+p2.i;
c1.r=p1.r+p3.r; // c1=twiddle1*d1+twiddle3*d3
c1.i=p1.i+p3.i;
c2.r=d0.r-p2.r; // c2=d0-twiddle2*d2
c2.i=d0.i-p2.i;
c3.r=p3.i-p1.i; // c3=j*(tiddle1*d1-twiddle3*d3)
c3.i=p1.r-p3.r;
d0.r=c0.r+c1.r; // d0=d0+twiddle2*d2+twiddle1*d1+twiddle3*d3
d0.i=c0.i+c1.i;
d1.r=c2.r-c3.r; // d1=d0-twiddle2*d2-j*(tiddle1*d1-twiddle3*d3)
d1.i=c2.i-c3.i;
d2.r=c0.r-c1.r; // d2=d0+twiddle2*d2-twiddle1*d1-twiddle3*d3
d2.i=c0.i-c1.i;
d3.r=c2.r+c3.r; // d3=d0-twiddle2*d2+j*(tiddle1*d1-twiddle3*d3)
d3.i=c2.i+c3.i;
// select the output data for each memory according to the bank assignments
select(mem0,0,bank0,d0,bank1,d1,bank2,d2,bank3,d3);
select(mem1,1,bank0,d0,bank1,d1,bank2,d2,bank3,d3);
select(mem2,2,bank0,d0,bank1,d1,bank2,d2,bank3,d3);
select(mem3,3,bank0,d0,bank1,d1,bank2,d2,bank3,d3);
// store the data
data0r[loc0>>2]=mem0.r;
data0i[loc0>>2]=mem0.i;
data1r[loc1>>2]=mem1.r;
data1i[loc1>>2]=mem1.i;
data2r[loc2>>2]=mem2.r;
data2i[loc2>>2]=mem2.i;
data3r[loc3>>2]=mem3.r;
data3i[loc3>>2]=mem3.i;
} while (i!=0);
// stream out the data
j0=0;
do {
#pragma CO PIPELINE
bank0=getBank(j0);
if (bank0==0) {
sr=data0r[j0>>2];
si=data0i[j0>>2];
}
if (bank0==1) {
sr=data1r[j0>>2];
si=data1i[j0>>2];
}
if (bank0==2) {
sr=data2r[j0>>2];
si=data2i[j0>>2];
}
if (bank0==3) {
sr=data3r[j0>>2];
si=data3i[j0>>2];
}
co_stream_write(result_stream,&sr,sizeof(float));
co_stream_write(result_stream,&si,sizeof(float));
j0++;
} while (j0!=0);
co_stream_close(sample_stream);
co_stream_close(result_stream);
IF_SIM(break;)
} while (1);
}
void config_fft(void *arg)
{
co_stream sample_stream;
co_stream result_stream;
co_process producer_process;
co_process fft_process;
co_process consumer_process;
sample_stream = co_stream_create( "sample_stream", INT_TYPE(32), 1 );
result_stream = co_stream_create( "result_stream", INT_TYPE(32), 1 );
producer_process = co_process_create("producer_process", (co_function)test_producer,
1,sample_stream);
fft_process = co_process_create("fft_process", (co_function)fft256proc,
2,sample_stream,result_stream);
consumer_process = co_process_create("consumer_process",(co_function)test_consumer,
1,result_stream);
co_process_config(fft_process, co_loc, "PE0");
}
co_architecture co_initialize(void *arg)
{
return(co_architecture_create("fft256","VHDL",config_fft,(void *)arg));
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -