dsp_fft.h
来自「dm642函数库」· C头文件 代码 · 共 535 行 · 第 1/3 页
H
535 行
/* xh20 = x[2 * i1 ] + x[2 * i3 ]; */
/* xh21 = x[2 * i1 + 1] + x[2 * i3 + 1]; */
/* xl20 = x[2 * i1 ] - x[2 * i3 ]; */
/* xl21 = x[2 * i1 + 1] - x[2 * i3 + 1]; */
/* */
/* x[2 * i0 ] = xh0 + xh20; */
/* x[2 * i0 + 1] = xh1 + xh21; */
/* */
/* xt0 = xh0 - xh20; */
/* yt0 = xh1 - xh21; */
/* xt1 = xl0 + xl21; */
/* yt2 = xl1 + xl20; */
/* xt2 = xl0 - xl21; */
/* yt1 = xl1 - xl20; */
/* */
/* x[2 * i1 ] = (xt1 * co1 + yt1 * si1) >> 15; */
/* x[2 * i1 + 1] = (yt1 * co1 - xt1 * si1) >> 15; */
/* x[2 * i2 ] = (xt0 * co2 + yt0 * si2) >> 15; */
/* x[2 * i2 + 1] = (yt0 * co2 - xt0 * si2) >> 15; */
/* x[2 * i3 ] = (xt2 * co3 + yt2 * si3) >> 15; */
/* x[2 * i3 + 1] = (yt2 * co3 - xt2 * si3) >> 15; */
/* } */
/* } */
/* */
/* ie <<= 2; */
/* } */
/* } */
/* */
/* The conventional Cooley Tukey FFT, is written using three loops. */
/* The outermost loop "k" cycles through the stages. There are log */
/* N to the base 4 stages in all. The loop "j" cycles through the */
/* groups of butterflies with different twiddle factors, loop "i" */
/* reuses the twiddle factors for the different butterflies within */
/* a stage. It is interesting to note the following: */
/* */
/*-------------------------------------------------------------------------S*/
/* Stage# #Groups # Butterflies with common #Groups*Bflys */
/* twiddle factors */
/*-------------------------------------------------------------------------S*/
/* 1 N/4 1 N/4 */
/* 2 N/16 4 N/4 */
/* .. */
/* logN 1 N/4 N/4 */
/*-------------------------------------------------------------------------S*/
/* */
/* The following statements can be made based on above observations: */
/* */
/* a) Inner loop "i0" iterates a veriable number of times. In */
/* particular the number of iterations quadruples every time from */
/* 1..N/4. Hence software pipelining a loop that iterates a vraiable */
/* number of times is not profitable. */
/* */
/* b) Outer loop "j" iterates a variable number of times as well. */
/* However the number of iterations is quartered every time from */
/* N/4 ..1. Hence the behaviour in (a) and (b) are exactly opposite */
/* to each other. */
/* */
/* c) If the two loops "i" and "j" are colaesced together then they */
/* will iterate for a fixed number of times namely N/4. This allows */
/* us to combine the "i" and "j" loops into 1 loop. Optimized impl- */
/* ementations will make use of this fact. */
/* */
/* In addition the Cooley Tukey FFT accesses three twiddle factors */
/* per iteration of the inner loop, as the butterflies that re-use */
/* twiddle factors are lumped together. This leads to accessing the */
/* twiddle factor array at three points each sepearted by "ie". Note */
/* that "ie" is initially 1, and is quadrupled with every iteration. */
/* Therfore these three twiddle factors are not even contiguous in */
/* the array. */
/* */
/* In order to vectorize the FFT, it is desirable to access twiddle */
/* factor array using double word wide loads and fetch the twiddle */
/* factors needed. In order to do this a modified twiddle factor */
/* array is created, in which the factors WN/4, WN/2, W3N/4 are */
/* arranged to be contiguous. This eliminates the seperation between */
/* twiddle factors within a butterfly. However this implies that as */
/* the loop is traversed from one stage to another, that we maintain */
/* a redundant version of the twiddle factor array. Hence the size */
/* of the twiddle factor array increases as compared to the normal */
/* Cooley Tukey FFT. The modified twiddle factor array is of size */
/* "2 * N" where the conventional Cooley Tukey FFT is of size"3N/4" */
/* where N is the number of complex points to be transformed. The */
/* routine that generates the modified twiddle factor array was */
/* presented earlier. With the above transformation of the FFT, */
/* both the input data and the twiddle factor array can be accessed */
/* using double-word wide loads to enable packed data processing. */
/* */
/* The final stage is optimised to remove the multiplication as */
/* w0 = 1. This stage also performs digit reversal on the data, */
/* so the final output is in natural order. */
/* */
/* The DSP_fft() code shown here performs the bulk of the computation */
/* in place. However, because digit-reversal cannot be performed */
/* in-place, the final result is written to a separate array, y[]. */
/* */
/* There is one slight break in the flow of packed processing that */
/* needs to be comprehended. The real part of the complex number is */
/* in the lower half, and the imaginary part is in the upper half. */
/* The flow breaks in case of "xl0" and "xl1" because in this case */
/* the real part needs to be combined with the imaginary part because */
/* of the multiplication by "j". This requires a packed quantity like */
/* "xl21xl20" to be rotated as "xl20xl21" so that it can be combined */
/* using add2's and sub2's. Hence the natural version of C code */
/* shown below is transformed using packed data processing as shown: */
/* */
/* xl0 = x[2 * i0 ] - x[2 * i2 ]; */
/* xl1 = x[2 * i0 + 1] - x[2 * i2 + 1]; */
/* xl20 = x[2 * i1 ] - x[2 * i3 ]; */
/* xl21 = x[2 * i1 + 1] - x[2 * i3 + 1]; */
/* */
/* xt1 = xl0 + xl21; */
/* yt2 = xl1 + xl20; */
/* xt2 = xl0 - xl21; */
/* yt1 = xl1 - xl20; */
/* */
/* xl1_xl0 = _sub2(x21_x20, x21_x20) */
/* xl21_xl20 = _sub2(x32_x22, x23_x22) */
/* xl20_xl21 = _rotl(xl21_xl20, 16) */
/* */
/* yt2_xt1 = _add2(xl1_xl0, xl20_xl21) */
/* yt1_xt2 = _sub2(xl1_xl0, xl20_xl21) */
/* */
/* Also notice that xt1, yt1 endup on seperate words, these need to */
/* be packed together to take advantage of the packed twiddle fact */
/* ors that have been loaded. In order for this to be achieved they */
/* are re-aligned as follows: */
/* */
/* yt1_xt1 = _packhl2(yt1_xt2, yt2_xt1) */
/* yt2_xt2 = _packhl2(yt2_xt1, yt1_xt2) */
/* */
/* The packed words "yt1_xt1" allows the loaded"sc" twiddle factor */
/* to be used for the complex multiplies. The real part os the */
/* complex multiply is implemented using _dotp2. The imaginary */
/* part of the complex multiply is implemented using _dotpn2 */
/* after the twiddle factors are swizzled within the half word. */
/* */
/* (X + jY) ( C + j S) = (XC + YS) + j (YC - XS). */
/* */
/* The actual twiddle factors for the FFT are cosine, - sine. The */
/* twiddle factors stored in the table are csine and sine, hence */
/* the sign of the "sine" term is comprehended during multipli- */
/* cation as shown above. */
/* */
/* CYCLES */
/* cycles = 1.25*nsamp*log4(nsamp) - 0.5*nsamp + 23*log4(nsamp) - 1 */
/* */
/* For nsamp = 1024, cycles = 6002 */
/* For nsamp = 256, cycles = 1243 */
/* For nsamp = 64, cycles = 276 */
/* */
/* CODESIZE */
/* 988 bytes */
/* */
/* C CODE */
/* This is the C equivalent of the assembly code without restrictions: */
/* Note that the assembly code is hand optimized and restrictions may */
/* apply. */
/* */
/* void DSP_fft(short *ptr_w, int n, short *ptr_x, short *ptr_y) */
/* { */
/* int i, j, l1, l2, h2, predj, tw_offset, stride, fft_jmp; */
/* short xt0_0, yt0_0, xt1_0, yt1_0, xt2_0, yt2_0; */
/* short xt0_1, yt0_1, xt1_1, yt1_1, xt2_1, yt2_1; */
/* short xh0_0, xh1_0, xh20_0, xh21_0, xl0_0, xl1_0, xl20_0, xl21_0; */
/* short xh0_1, xh1_1, xh20_1, xh21_1, xl0_1, xl1_1, xl20_1, xl21_1; */
/* short x_0, x_1, x_2, x_3, x_l1_0, x_l1_1, x_l1_2, x_l1_3, x_l2_0: */
/* short x_10, x_11, x_12, x_13, x_14, x_15, x_16, x_17, x_l2_1, x_h2_3; */
/* short x_4, x_5, x_6, x_7, x_l2_2, x_l2_3, x_h2_0, x_h2_1, x_h2_2; */
/* short si10, si20, si30, co10, co20, co30; */
/* short si11, si21, si31, co11, co21, co31; */
/* short * x, *w, * x2, * x0; */
/* short * y0, * y1, * y2, *y3; */
/* */
/* stride = n; -* n is the number of complex samples *- */
/* tw_offset = 0; */
/* while (stride > 4) // for all strides > 4 // */
/* { */
/* j = 0; */
/* fft_jmp = stride + (stride>>1); */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?