📄 fftspx_h.asm
字号:
* ============================================================================ *
* *
* TEXAS INSTRUMENTS, INC. *
* *
* NAME *
* fftSPxSP *
* *
* USAGE *
* This routine is C-callable and can be called as: *
* *
* void fftSPxSP_asm( *
* int N, float * ptr_x, float * ptr_w, float * ptr_y, *
* unsigned char brev[], int n_min, int offset, int n_max); *
* *
* N = length of fft in complex samples, power of 2 < 8192 *
* ptr_x = pointer to complex data input *
* ptr_w = pointer to complex twiddle factor (see below) *
* brev = pointer to bit reverse table containing 64 entries *
* n_min = smallest fft butterfly used in computation *
* used for decomposing fft into subffts, see notes *
* offset = index in complex samples of sub-fft from start of main fft*
* n_max = size of main fft in complex samples *
* *
* (See the C compiler reference guide.) *
* *
* DESCRIPTION *
* The benchmark performs a mixed radix forwards fft using *
* a special sequece of coefficients generated in the following *
* way: *
* *
* /* generate vector of twiddle factors for optimized algorithm */ *
* void tw_gen(float * w, int N) *
* { *
* int j, k; *
* double x_t, y_t, theta1, theta2, theta3; *
* const double PI = 3.141592654; *
* *
* for (j=1, k=0; j <= N>>2; j = j<<2) *
* { *
* for (i=0; i < N>>2; i+=j) *
* { *
* theta1 = 2*PI*i/N; *
* x_t = cos(theta1); *
* y_t = sin(theta1); *
* w[k] = (float)x_t; *
* w[k+1] = (float)y_t; *
* *
* theta2 = 4*PI*i/N; *
* x_t = cos(theta2); *
* y_t = sin(theta2); *
* w[k+2] = (float)x_t; *
* w[k+3] = (float)y_t; *
* *
* theta3 = 6*PI*i/N; *
* x_t = cos(theta3); *
* y_t = sin(theta3); *
* w[k+4] = (float)x_t; *
* w[k+5] = (float)y_t; *
* k+=6; *
* } *
* } *
* } *
* This redundent set of twiddle factors is size 2*N float samples. *
* The function is accurate to about 130dB of signal to noise ratio *
* to the DFT function below: *
* *
* void dft(int n, float x[], float y[]) *
* { *
* int k,i, index; *
* const float PI = 3.14159654; *
* float * p_x; *
* float arg, fx_0, fx_1, fy_0, fy_1, co, si; *
* *
* for(k = 0; k<n; k++) *
* { *
* p_x = x; *
* fy_0 = 0; *
* fy_1 = 0; *
* for(i=0; i<n; i++) *
* { *
* fx_0 = p_x[0]; *
* fx_1 = p_x[1]; *
* p_x += 2; *
* index = (i*k) % n; *
* arg = 2*PI*index/n; *
* co = cos(arg); *
* si = -sin(arg); *
* fy_0 += ((fx_0 * co) - (fx_1 * si)); *
* fy_1 += ((fx_1 * co) + (fx_0 * si)); *
* } *
* y[2*k] = fy_0; *
* y[2*k+1] = fy_1; *
* } *
* } *
* The function takes the table and input data and calculates the fft *
* producing the frequency domain data in the Y array. *
* As the fft allows every input point to effect every output point in *
* a cache based system such as the c6711, this causes cache thrashing.*
* This is mitigated by allowing the main fft of size N to be divided *
* into several steps, allowing as much data reuse as possible. *
* *
* For example the following function: *
* *
* fftSPxSP_asm(1024, &x_asm[0],&w[0],y_asm,brev,4, 0,1024); *
* *
* is equvalent to: *
* *
* fftSPxSP_asm(256, &x_asm[2*0], &w[2*768],y_asm,brev,4, 0,1024); *
* fftSPxSP_asm(256, &x_asm[2*256],&w[2*768],y_asm,brev,4, 256,1024); *
* fftSPxSP_asm(256, &x_asm[2*512],&w[2*768],y_asm,brev,4, 512,1024); *
* fftSPxSP_asm(256, &x_asm[2*768],&w[2*768],y_asm,brev,4, 768,1024); *
* *
* Notice how the 1st fft function is called on the entire 1K data set *
* it covers the 1st pass of the fft until the butterfly size is 256. *
* The following 4 ffts do 256 pt ffts 25% of the size. These continue *
* down to the end when the buttefly is of size 4. The use an index to *
* the main twiddle factor array of 0.75*2*N. This is because the *
* twiddle factor array is composed of successively decimated versions *
* of the main array. *
* *
* N not equal to a power of 4 can be used, i.e. 512. In this case to *
* decompose the fft the following would be needed : *
* *
* fftSPxSP_asm(512, &x_asm[0],&w[0],y_asm,brev,2, 0,512); *
* *
* is equvalent to: *
* *
* fftSPxSP_asm(128, &x_asm[2*0], &w[2*384],y_asm,brev,4, 0,512); *
* fftSPxSP_asm(128, &x_asm[2*128],&w[2*384],y_asm,brev,4, 128,512); *
* fftSPxSP_asm(128, &x_asm[2*256],&w[2*384],y_asm,brev,4, 256,512); *
* fftSPxSP_asm(128, &x_asm[2*384],&w[2*384],y_asm,brev,4, 384,512); *
* *
* The twiddle factor array is composed of log4(N) sets of twiddle *
* factors, (3/4)*N, (3/16)*N, (3/64)*N, etc. The index into this *
* array for each stage of the fft is calculated by summing these *
* indices up appropriately. *
* For multiple ffts they can share the same table by calling the small*
* ffts from further down in the twiddle factor array. In the same way *
* as the decomposition works for more data reuse. *
* *
* ASSUMPTIONS: *
* n must be a power of 2 and n >= 8 n <= 16384 points. *
* Complex time data x and twiddle facotrs w are aligned on double *
* word boundares. Real values are stored in even word positions and *
* imaginary values in odd positions. *
* *
* All data is in single precision floating point format. The complex *
* frequency data will be returned in linear order. *
* This code is interupti tolerant, interupts are disabled on entry to *
* the function and reanlbed on exit. *
* *
* MEMORY NOTE: *
* Configuration is BI ENDIAN either tools flag -me or none will give *
* the same results. No memory bank hits occur in this code. *
* *
* TECHNIQUES *
* A special sequence of coeffs. used as generated above *
* produces the fft. This collapses the inner 2 loops in the *
* taditional Burrus and Parks implementation Fortran Code. *
* *
* The revised FFT uses a redundant sequence of twiddle factors to *
* allow a linear access through the data. This linear access enables *
* data and instruction level parallelism. *
* The data produced by the fftSPxSP fft is in normal form, the *
* whole data array is written into a new output buffer. *
* *
* The fftSPxSP butterfly is bit reversed, i.e. the inner 2 points of *
* the butterfly are corssed over, this has the effect of making the *
* data come out in bit reversed rather than fftSPxSP digit reversed *
* order. This simplifies the last pass of the loop. ia simple table *
* is used to do the bit reversal out of place. *
* *
* unsigned char brev[64] = { *
* 0x0, 0x20, 0x10, 0x30, 0x8, 0x28, 0x18, 0x38, *
* 0x4, 0x24, 0x14, 0x34, 0xc, 0x2c, 0x1c, 0x3c, *
* 0x2, 0x22, 0x12, 0x32, 0xa, 0x2a, 0x1a, 0x3a, *
* 0x6, 0x26, 0x16, 0x36, 0xe, 0x2e, 0x1e, 0x3e, *
* 0x1, 0x21, 0x11, 0x31, 0x9, 0x29, 0x19, 0x39, *
* 0x5, 0x25, 0x15, 0x35, 0xd, 0x2d, 0x1d, 0x3d, *
* 0x3, 0x23, 0x13, 0x33, 0xb, 0x2b, 0x1b, 0x3b, *
* 0x7, 0x27, 0x17, 0x37, 0xf, 0x2f, 0x1f, 0x3f *
* }; *
* *
* C CODE *
* This is the C equivalent of the assembly code without restrictions:*
* Note that the assembly code is hand optimized and restrictions may *
* apply. *
* *
* *
* void fftSPxSP_co(int n, float ptr_x[], float ptr_w[], float ptr_y[], *
* unsigned char brev[], int n_min, int offset, int n_max) *
* { *
* int i, j, k, l1, l2, h2, predj; *
* int tw_offset, stride, fft_jmp; *
* *
* float x0, x1, x2, x3,x4,x5,x6,x7; *
* float xt0, yt0, xt1, yt1, xt2, yt2, yt3; *
* float yt4, yt5, yt6, yt7; *
* float si1,si2,si3,co1,co2,co3; *
* float xh0,xh1,xh20,xh21,xl0,xl1,xl20,xl21; *
* float x_0, x_1, x_l1, x_l1p1, x_h2 , x_h2p1, x_l2, x_l2p1; *
* float xl0_0, xl1_0, xl0_1, xl1_1; *
* float xh0_0, xh1_0, xh0_1, xh1_1; *
* float *x,*w; *
* int k0, k1, j0, j1, l0, radix; *
* float * y0, * ptr_x0, * ptr_x2; *
* *
* radix = n_min; *
* *
* stride = n; /* n is the number of complex samples */ *
* tw_offset = 0; *
* while (stride > radix) *
* { *
* j = 0; *
* fft_jmp = stride + (stride>>1); *
* h2 = stride>>1; *
* l1 = stride; *
* l2 = stride + (stride>>1); *
* x = ptr_x; *
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -