📄 dsp_fft32x32.c
字号:
/* xl1 = x[2 * i0 + 1] - x[2 * i2 + 1]; *//* xl20 = x[2 * i1 ] - x[2 * i3 ]; *//* xl21 = x[2 * i1 + 1] - x[2 * i3 + 1]; *//* *//* xt1 = xl0 + xl21; *//* yt2 = xl1 + xl20; *//* xt2 = xl0 - xl21; *//* yt1 = xl1 - xl20; *//* *//* xl1_xl0 = _sub2(x21_x20, x21_x20) *//* xl21_xl20 = _sub2(x32_x22, x23_x22) *//* xl20_xl21 = _rotl(xl21_xl20, 16) *//* *//* yt2_xt1 = _add2(xl1_xl0, xl20_xl21) *//* yt1_xt2 = _sub2(xl1_xl0, xl20_xl21) *//* *//* Also notice that xt1, yt1 endup on seperate words, these need to *//* be packed together to take advantage of the packed twiddle fact *//* ors that have been loaded. In order for this to be achieved they *//* are re-aligned as follows: *//* *//* yt1_xt1 = _packhl2(yt1_xt2, yt2_xt1) *//* yt2_xt2 = _packhl2(yt2_xt1, yt1_xt2) *//* *//* The packed words "yt1_xt1" allows the loaded"sc" twiddle factor *//* to be used for the complex multiplies. The real part os the *//* complex multiply is implemented using _dotp2. The imaginary *//* part of the complex multiply is implemented using _dotpn2 *//* after the twiddle factors are swizzled within the half word. *//* *//* (X + jY) ( C + j S) = (XC + YS) + j (YC - XS). *//* *//* The actual twiddle factors for the FFT are cosine, - sine. The *//* twiddle factors stored in the table are csine and sine, hence *//* the sign of the "sine" term is comprehended during multipli- *//* cation as shown above. *//* *//* *//* ASSUMPTIONS *//* *//* The size of the FFT, n, must be a power of 4 and greater than *//* or equal to 16 and less than 32768. *//* *//* The arrays 'x[]', 'y[]', and 'w[]' all must be aligned on a *//* double-word boundary for the "optimized" implementations. *//* *//* The input and output data are complex, with the real/imaginary *//* components stored in adjacent locations in the array. The real *//* components are stored at even array indices, and the imaginary *//* components are stored at odd array indices. *//* *//* ------------------------------------------------------------------------ *//* Copyright (c) 2007 Texas Instruments, Incorporated. *//* All Rights Reserved. *//* ======================================================================== */#pragma CODE_SECTION(DSP_fft32x32_i, ".text:intrinsic");#include "DSP_fft32x32.h"/*--------------------------------------------------------------------------*//* The following macro is used to obtain a digit reversed index, of a given *//* number i, into j where the number of bits in "i" is "m". For the natural *//* form of C code, this is done by first interchanging every set of "2 bit" *//* pairs, followed by exchanging nibbles, followed by exchanging bytes, and *//* finally halfwords. To give an example, condider the following number: *//* *//* N = FEDCBA9876543210, where each digit represents a bit, the following *//* steps illustrate the changes as the exchanges are performed: *//* M = DCFE98BA54761032 is the number after every "2 bits" are exchanged. *//* O = 98BADCFE10325476 is the number after every nibble is exchanged. *//* P = 1032547698BADCFE is the number after every byte is exchanged. *//* Since only 16 digits were considered this represents the digit reversed *//* index. Since the numbers are represented as 32 bits, there is one more *//* step typically of exchanging the half words as well. *//*--------------------------------------------------------------------------*/ #if 0# define DIG_REV(i, m, j) ((j) = (_shfl(_rotl(_bitr(_deal(i)), 16)) >> (m)))#else# define DIG_REV(i, m, j) \ do { \ unsigned _ = (i); \ _ = ((_ & 0x33333333) << 2) | ((_ & ~0x33333333) >> 2); \ _ = ((_ & 0x0F0F0F0F) << 4) | ((_ & ~0x0F0F0F0F) >> 4); \ _ = ((_ & 0x00FF00FF) << 8) | ((_ & ~0x00FF00FF) >> 8); \ _ = ((_ & 0x0000FFFF) << 16) | ((_ & ~0x0000FFFF) >> 16); \ (j) = _ >> (m); \ } while (0)#endifstatic inline void radix_2(int *restrict ptr_x, int *restrict ptr_y, int npoints);static inline void radix_4(int *restrict ptr_x, int *restrict ptr_y, int npoints);void DSP_fft32x32_i ( const int * restrict ptr_w, int npoints, int * restrict ptr_x, int * restrict ptr_y){ int * restrict w; int * restrict x, * restrict x2, * restrict x0; int i, j, l1, l2, h2, predj, tw_offset, stride, fft_jmp, radix; int si10, si20, si30, co10, co20, co30; int si11, si21, si31, co11, co21, co31; int xt0_0, yt0_0, yt0_1, xt0_1; int xt1_0, yt2_0, xt1_1, yt2_1, xt2_0, yt1_0, xt2_1, yt1_1; int p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF; int p10, p11, p12, p13, p14, p15, p16, p17; int xh2_0o, xh2_1o, xh2_2o, xh2_3o; int xl1_0o, xl1_1o, xl1_2o, xl1_3o; int xl2_0o, xl2_1o, xl2_2o, xl2_3o; double ydword0, ydword1, yl1dword0, yl1dword1, yh2dword0, yh2dword1; double x_10, x_32; double x_l1_10, x_l1_32, x_l2_10, x_l2_32, x_h2_10,x_h2_32; double co10_si10, co20_si20, co30_si30; double co11_si11, co21_si21, co31_si31; long long xh20_0_xl20_0, xh21_0_xl21_0, xh20_1_xl20_1, xh21_1_xl21_1; long long xt1_0_xt2_0, yt2_0_yt1_0, xt1_1_xt2_1, yt2_1_yt1_1; long long xh0_0_xl0_0, xh1_0_xl1_0, xh0_1_xl0_1, xh1_1_xl1_1; long long x_0o_xt0_0, x_1o_yt0_0, x_2o_xt0_1, x_3o_yt0_1; /*---------------------------------------------------------------------*/ /* Determine the magnitude od the number of points to be transformed. */ /* Check whether we can use a radix4 decomposition or a mixed radix */ /* transformation, by determining modulo 2. */ /*---------------------------------------------------------------------*/ radix = _norm(npoints) & 1 ? 2 : 4; /*----------------------------------------------------------------------*/ /* The stride is quartered with every iteration of the outer loop. It */ /* denotes the seperation between any two adjacent inputs to the butter */ /* -fly. This should start out at N/4, hence stride is initially set to */ /* N. For every stride, 6*stride twiddle factors are accessed. The */ /* "tw_offset" is the offset within the current twiddle factor sub- */ /* table. This is set to zero, at the start of the code and is used to */ /* obtain the appropriate sub-table twiddle pointer by offseting it */ /* with the base pointer "ptr_w". */ /*----------------------------------------------------------------------*/ stride = npoints; tw_offset = 0; fft_jmp = 6 * stride; _nassert(stride > 4); #pragma MUST_ITERATE(1,,1); while (stride > radix) { /*-----------------------------------------------------------------*/ /* At the start of every iteration of the outer loop, "j" is set */ /* to zero, as "w" is pointing to the correct location within the */ /* twiddle factor array. For every iteration of the inner loop */ /* 6 * stride twiddle factors are accessed. For eg, */ /* */ /* #Iteration of outer loop # twiddle factors #times cycled */ /* 1 6 N/4 1 */ /* 2 6 N/16 4 */ /* ... */ /*-----------------------------------------------------------------*/ j = 0; fft_jmp >>= 2; /*-----------------------------------------------------------------*/ /* Set up offsets to access "N/4", "N/2", "3N/4" complex point or */ /* "N/2", "N", "3N/2" half word */ /*-----------------------------------------------------------------*/ h2 = stride >> 1; l1 = stride; l2 = stride + (stride >> 1); /*-----------------------------------------------------------------*/ /* Reset "x" to point to the start of the input data array. */ /* "tw_offset" starts off at 0, and increments by "6 * stride" */ /* The stride quarters with every iteration of the outer loop */ /*-----------------------------------------------------------------*/ x = ptr_x; w = (int *)ptr_w + tw_offset; tw_offset += fft_jmp; stride >>= 2; /*----------------------------------------------------------------*/ /* The following loop iterates through the different butterflies, */ /* within a given stage. Recall that there are logN to base 4 */ /* stages. Certain butterflies share the twiddle factors. These */ /* are grouped together. On the very first stage there are no */ /* butterflies that share the twiddle factor, all N/4 butter- */ /* flies have different factors. On the next stage two sets of */ /* N/8 butterflies share the same twiddle factor. Hence after */ /* half the butterflies are performed, j the index into the */ /* factor array resets to 0, and the twiddle factors are reused. */ /* When this happens, the data pointer 'x' is incremented by the */ /* fft_jmp amount. */ /*----------------------------------------------------------------*/ _nassert((int)(w) % 8 == 0); _nassert((int)(x) % 8 == 0); _nassert(h2 % 8 == 0); _nassert(l1 % 8 == 0); _nassert(l2 % 8 == 0); _nassert(npoints >= 16); #pragma MUST_ITERATE(16,,16); for (i = 0; i < (npoints >> 3); i++) { /*------------------------------------------------------------*/ /* Read the first six twiddle factor values. This loop */ /* computes two radix 4 butterflies at a time. */ /* si10 = w[0] co10 = w[1] si20 = w[2] co20 = w[3] */ /* si30 = w[4] co30 = w[5] si11 = w[6] co11 = w[7] */ /* si21 = w[8] co21 = w[9] si31 = w[a] co31 = w[b] */ /*------------------------------------------------------------*/ co10_si10 = _amemd8_const(&w[j]); co20_si20 = _amemd8_const(&w[j + 2]); co30_si30 = _amemd8_const(&w[j + 4]); co11_si11 = _amemd8_const(&w[j + 6]); co21_si21 = _amemd8_const(&w[j + 8]); co31_si31 = _amemd8_const(&w[j + 10]); si10 = _lo(co10_si10); co10 = _hi(co10_si10); si20 = _lo(co20_si20); co20 = _hi(co20_si20); si30 = _lo(co30_si30); co30 = _hi(co30_si30); si11 = _lo(co11_si11); co11 = _hi(co11_si11); si21 = _lo(co21_si21); co21 = _hi(co21_si21); si31 = _lo(co31_si31); co31 = _hi(co31_si31); /*-------------------------------------------------------------*/ /* Read in the data elements for the eight inputs of two */ /* radix4 butterflies. */ /* x[0] x[1] x[2] x[3] */ /* x[h2+0] x[h2+1] x[h2+2] x[h2+3] */ /* x[l1+0] x[l1+1] x[l1+2] x[l1+3] */ /* x[l2+0] x[l2+1] x[l2+2] x[l2+3] */ /*-------------------------------------------------------------*/
x_10 = _amemd8(&x[0]); x_32 = _amemd8(&x[2]); x_l1_10 = _amemd8(&x[l1]); x_l1_32 = _amemd8(&x[l1 + 2]); x_l2_10 = _amemd8(&x[l2]); x_l2_32 = _amemd8(&x[l2 + 2]); x_h2_10 = _amemd8(&x[h2]); x_h2_32 = _amemd8(&x[h2 + 2]);
/*-------------------------------------------------------------*/ /* xh0_0 = x[0] + x[l1]; xh1_0 = x[1] + x[l1+1] */ /* xh0_1 = x[2] + x[l1+2]; xh1_1 = x[3] + x[l1+3] */ /* xl0_0 = x[0] - x[l1]; xl1_0 = x[1] - x[l1+1] */ /* xl0_1 = x[2] - x[l1+2]; xl1_1 = x[3] - x[l1+3] */ /*-------------------------------------------------------------*/ xh0_0_xl0_0 = _addsub(_lo(x_10), _lo(x_l1_10)); xh1_0_xl1_0 = _addsub(_hi(x_10), _hi(x_l1_10)); xh0_1_xl0_1 = _addsub(_lo(x_32), _lo(x_l1_32));
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -