📄 fftspx_h.asm

📁 TMS320bbs（源程序）的c67xfiles文件。用于在CCS2.0集成编译环境下实现TI的c67x系列DSP开发。是用DSP汇编语言
💻 ASM
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
* ============================================================================ *
*                                                                              *
*     TEXAS INSTRUMENTS, INC.                                                  *
*                                                                              *
*     NAME                                                                     *
*           fftSPxSP                                                           *
*                                                                              *
*     USAGE                                                                    *
*           This routine is C-callable and can be called as:                   *
*                                                                              *
*           void fftSPxSP_asm(                                                 *
*               int N, float * ptr_x, float * ptr_w, float * ptr_y,            *
*               unsigned char brev[], int n_min, int offset, int n_max);       *
*                                                                              *
*           N = length of fft in complex samples, power of 2 < 8192            *
*           ptr_x = pointer to complex data input                              *
*           ptr_w = pointer to complex twiddle factor (see below)              *
*           brev = pointer to bit reverse table containing 64 entries          *
*           n_min = smallest fft butterfly used in computation                 *
*                   used for decomposing fft into subffts, see notes           *
*           offset = index in complex samples of sub-fft from start of main fft*
*           n_max = size of main fft in complex samples                        *
*                                                                              *
*           (See the C compiler reference guide.)                              *
*                                                                              *
*     DESCRIPTION                                                              *
*          The benchmark performs a mixed radix forwards fft using             *
*          a special sequece of coefficients generated in the following        *
*          way:                                                                *
*                                                                              *
*            /* generate vector of twiddle factors for optimized algorithm */  *
*           void tw_gen(float * w, int N)                                      *
*           {                                                                  *
*             int j, k;                                                        *
*             double x_t, y_t, theta1, theta2, theta3;                         *
*             const double PI = 3.141592654;                                   *
*                                                                              *
*             for (j=1, k=0; j <= N>>2; j = j<<2)                              *
*             {                                                                *
*                 for (i=0; i < N>>2; i+=j)                                    *
*                 {                                                            *
*                     theta1 = 2*PI*i/N;                                       *
*                     x_t = cos(theta1);                                       *
*                     y_t = sin(theta1);                                       *
*                     w[k]   =  (float)x_t;                                    *
*                     w[k+1] =  (float)y_t;                                    *
*                                                                              *
*                     theta2 = 4*PI*i/N;                                       *
*                     x_t = cos(theta2);                                       *
*                     y_t = sin(theta2);                                       *
*                     w[k+2] =  (float)x_t;                                    *
*                     w[k+3] =  (float)y_t;                                    *
*                                                                              *
*                     theta3 = 6*PI*i/N;                                       *
*                     x_t = cos(theta3);                                       *
*                     y_t = sin(theta3);                                       *
*                     w[k+4] =  (float)x_t;                                    *
*                     w[k+5] =  (float)y_t;                                    *
*                     k+=6;                                                    *
*                 }                                                            *
*             }                                                                *
*           }                                                                  *
*          This redundent set of twiddle factors is size 2*N float samples.    *
*          The function is accurate to about 130dB of signal to noise ratio    *
*          to the DFT function below:                                          *
*                                                                              *
*           void dft(int n, float x[], float y[])                              *
*           {                                                                  *
*              int k,i, index;                                                 *
*              const float PI = 3.14159654;                                    *
*              float * p_x;                                                    *
*              float arg, fx_0, fx_1, fy_0, fy_1, co, si;                      *
*                                                                              *
*              for(k = 0; k<n; k++)                                            *
*              {                                                               *
*                p_x = x;                                                      *
*                fy_0 = 0;                                                     *
*                fy_1 = 0;                                                     *
*                for(i=0; i<n; i++)                                            *
*                {                                                             *
*                  fx_0 = p_x[0];                                              *
*                  fx_1 = p_x[1];                                              *
*                  p_x += 2;                                                   *
*                  index = (i*k) % n;                                          *
*                  arg = 2*PI*index/n;                                         *
*                  co = cos(arg);                                              *
*                  si = -sin(arg);                                             *
*                  fy_0 += ((fx_0 * co) - (fx_1 * si));                        *
*                  fy_1 += ((fx_1 * co) + (fx_0 * si));                        *
*                }                                                             *
*                y[2*k] = fy_0;                                                *
*                y[2*k+1] = fy_1;                                              *
*              }                                                               *
*           }                                                                  *
*          The function takes the table and input data and calculates the fft  *
*          producing the frequency domain data in the Y array.                 *
*          As the fft allows every input point to effect every output point in *
*          a cache based system such as the c6711, this causes cache thrashing.*
*          This is mitigated by allowing the main fft of size N to be divided  *
*          into several steps, allowing as much data reuse as possible.        *
*                                                                              *
*          For example the following function:                                 *
*                                                                              *
*          fftSPxSP_asm(1024, &x_asm[0],&w[0],y_asm,brev,4,  0,1024);          *
*                                                                              *
*          is equvalent to:                                                    *
*                                                                              *
*          fftSPxSP_asm(256, &x_asm[2*0],  &w[2*768],y_asm,brev,4,    0,1024); *
*          fftSPxSP_asm(256, &x_asm[2*256],&w[2*768],y_asm,brev,4,  256,1024); *
*          fftSPxSP_asm(256, &x_asm[2*512],&w[2*768],y_asm,brev,4,  512,1024); *
*          fftSPxSP_asm(256, &x_asm[2*768],&w[2*768],y_asm,brev,4,  768,1024); *
*                                                                              *
*          Notice how the 1st fft function is called on the entire 1K data set *
*          it covers the 1st pass of the fft until the butterfly size is 256.  *
*          The following 4 ffts do 256 pt ffts 25% of the size. These continue *
*          down to the end when the buttefly is of size 4. The use an index to *
*          the main twiddle factor array of 0.75*2*N. This is because the      *
*          twiddle factor array is composed of successively decimated versions *
*          of the main array.                                                  *
*                                                                              *
*          N not equal to a power of 4 can be used, i.e. 512. In this case to  *
*          decompose the fft the following would be needed :                   *
*                                                                              *
*          fftSPxSP_asm(512, &x_asm[0],&w[0],y_asm,brev,2,  0,512);            *
*                                                                              *
*          is equvalent to:                                                    *
*                                                                              *
*          fftSPxSP_asm(128, &x_asm[2*0],  &w[2*384],y_asm,brev,4,    0,512);  *
*          fftSPxSP_asm(128, &x_asm[2*128],&w[2*384],y_asm,brev,4,  128,512);  *
*          fftSPxSP_asm(128, &x_asm[2*256],&w[2*384],y_asm,brev,4,  256,512);  *
*          fftSPxSP_asm(128, &x_asm[2*384],&w[2*384],y_asm,brev,4,  384,512);  *
*                                                                              *
*          The twiddle factor array is composed of log4(N) sets of twiddle     *
*          factors, (3/4)*N, (3/16)*N, (3/64)*N, etc.  The index into this     *
*          array for each stage of the fft is calculated by summing these      *
*          indices up appropriately.                                           *
*          For multiple ffts they can share the same table by calling the small*
*          ffts from further down in the twiddle factor array. In the same way *
*          as the decomposition works for more data reuse.                     *
*                                                                              *
*     ASSUMPTIONS:                                                             *
*          n must be a power of 2 and n >= 8  n <= 16384 points.               *
*          Complex time data x and twiddle facotrs w are aligned on double     *
*          word boundares. Real values are stored in even word positions and   *
*          imaginary values in odd positions.                                  *
*                                                                              *
*          All data is in single precision floating point format. The complex  *
*          frequency data will be returned in linear order.                    *
*          This code is interupti tolerant, interupts are disabled on entry to *
*          the function and reanlbed on exit.                                  *
*                                                                              *
*     MEMORY NOTE:                                                             *
*          Configuration is BI ENDIAN either tools flag -me or none will give  *
*          the same results. No memory bank hits occur in this code.           *
*                                                                              *
*     TECHNIQUES                                                               *
*          A special sequence of coeffs. used as generated above               *
*          produces the fft. This collapses the inner 2 loops in the           *
*          taditional Burrus and Parks implementation Fortran Code.            *
*                                                                              *
*          The revised FFT uses a redundant sequence of twiddle factors to     *
*          allow a linear access through the data. This linear access enables  *
*          data and instruction level parallelism.                             *
*          The data produced by the fftSPxSP fft is in normal form, the        *
*          whole data array is written into a new output buffer.               *
*                                                                              *
*          The fftSPxSP butterfly is bit reversed, i.e. the inner 2 points of  *
*          the butterfly are corssed over, this has the effect of making the   *
*          data come out in bit reversed rather than fftSPxSP digit reversed   *
*          order. This simplifies the last pass of the loop. ia simple table   *
*          is used to do the bit reversal out of place.                        *
*                                                                              *
*              unsigned char brev[64] = {                                      *
*                    0x0, 0x20, 0x10, 0x30, 0x8, 0x28, 0x18, 0x38,             *
*                    0x4, 0x24, 0x14, 0x34, 0xc, 0x2c, 0x1c, 0x3c,             *
*                    0x2, 0x22, 0x12, 0x32, 0xa, 0x2a, 0x1a, 0x3a,             *
*                    0x6, 0x26, 0x16, 0x36, 0xe, 0x2e, 0x1e, 0x3e,             *
*                    0x1, 0x21, 0x11, 0x31, 0x9, 0x29, 0x19, 0x39,             *
*                    0x5, 0x25, 0x15, 0x35, 0xd, 0x2d, 0x1d, 0x3d,             *
*                    0x3, 0x23, 0x13, 0x33, 0xb, 0x2b, 0x1b, 0x3b,             *
*                    0x7, 0x27, 0x17, 0x37, 0xf, 0x2f, 0x1f, 0x3f              *
*              };                                                              *
*                                                                              *
*     C CODE                                                                   *
*           This is the C equivalent of the assembly code without restrictions:*
*           Note that the assembly code is hand optimized and restrictions may *
*           apply.                                                             *
*                                                                              *
*                                                                              *
*         void fftSPxSP_co(int n, float ptr_x[], float ptr_w[], float ptr_y[], *
*                      unsigned char brev[], int n_min, int offset, int n_max) *
*         {                                                                    *
*            int  i, j, k, l1, l2, h2, predj;                                  *
*            int  tw_offset, stride, fft_jmp;                                  *
*                                                                              *
*            float x0, x1, x2, x3,x4,x5,x6,x7;                                 *
*            float xt0, yt0, xt1, yt1, xt2, yt2, yt3;                          *
*            float yt4, yt5, yt6, yt7;                                         *
*            float si1,si2,si3,co1,co2,co3;                                    *
*            float xh0,xh1,xh20,xh21,xl0,xl1,xl20,xl21;                        *
*            float x_0, x_1, x_l1, x_l1p1, x_h2 , x_h2p1, x_l2, x_l2p1;        *
*            float xl0_0, xl1_0, xl0_1, xl1_1;                                 *
*            float xh0_0, xh1_0, xh0_1, xh1_1;                                 *
*            float *x,*w;                                                      *
*            int   k0, k1, j0, j1, l0, radix;                                  *
*            float * y0, * ptr_x0, * ptr_x2;                                   *
*                                                                              *
*            radix = n_min;                                                    *
*                                                                              *
*            stride = n; /* n is the number of complex samples */              *
*            tw_offset = 0;                                                    *
*            while (stride > radix)                                            *
*            {                                                                 *
*                j = 0;                                                        *
*                fft_jmp = stride + (stride>>1);                               *
*                h2 = stride>>1;                                               *
*                l1 = stride;                                                  *
*                l2 = stride + (stride>>1);                                    *
*                x = ptr_x;                                                    *
12 3 4 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -