📄 icfftr2_dif.asm
字号:
;File was assembled with assembler version 4.0 (included with CCS V1.2) and
;NOT with assembler version 4.1 (included with CCS V2) due to "BUG" SDSsq21569.
;icfftr2_dif.asm file used as source in project in lieu of icfftr2_dif.sa.
;RC,August 2001.
;******************************************************************************
;* TMS320C6x ANSI C Codegen Version 4.00 *
;* Date/Time created: Mon Aug 20 20:26:58 2001 *
;******************************************************************************
;******************************************************************************
;* GLOBAL FILE PARAMETERS *
;* *
;* Architecture : TMS320C671x *
;* Optimization : Enabled at level 1 *
;* Optimizing for : Compile time 1st, speed 2nd *
;* Based on options: -o1, no -ms *
;* Endian : Little *
;* Interrupt Thrshld : Disabled *
;* Memory Model : Small *
;* Calls to RTS : Near *
;* Pipelining : Disabled *
;* Memory Aliases : Presume are aliases (pessimistic) *
;* Debug Info : Debug *
;* *
;******************************************************************************
FP .set A15
DP .set B14
SP .set B15
.global $bss
.file "icfftr2_dif.sa"
*===============================================================================
*
* From: https://www-a.ti.com/apps/c6000/xt_download.asp?sku=C67x_icfftr2
*
* TEXAS INSTRUMENTS, INC.
*
* INVERSE COMPLEX RADIX-2 DECIMATION-IN-FREQUENCY FFT
*
* Revision Date: 07/06/98
*
* USAGE
* This routine is C-callable and can be called as:
*
* void icfftr2_dif(float* x, float* w, short n)
*
* x[] --- input and output sequences (dim-n) (input/output)
* x has n complex numbers (2*n SP values).
* The real and imaginary values are interleaved
* in memory: re0:im0, re1:im1, .....
* w[] --- FFT coefficients (dim-n/2) (input)
* w has n/2 complex numbers (n SP values).
* FFT coeficients must be in bt-reversed order
* The real and imaginary values are interleaved
* in memory: re0:im0, re1:im1, .....
* n --- FFT size (input)
*
* If the routine is not to be used as a C-callable function,
* then all instructions relating to the stack should be removed.
* See comments of individual instructions to determine if they are
* related to the stack. You also need to initialize all passed
* parameters since these are assumed to be in registers as defined by
* the calling convention of the compiler, (See the C compiler
* reference guide.)
*
* C CODE
* This is the C equivalent of the assembly code without restrictions:
* Note that the assembly code is hand optimized and restrictions may
* apply.
*
* void icfftr2_dif(float* x, float* w, short n)
* {
* short n2, ie, ia, i, j, k, m;
* float rtemp, itemp, c, s;
*
* n2 = 1;
* ie = n;
* for(k=n; k > 1; k >>= 1)
* {
* ie >>= 1;
* ia = 0;
* for(j=0; j < ie; j++)
* {
* c = w[2*j];
* s = w[2*j+1];
* for(i=0; i < n2; i++)
* {
* m = ia + n2;
* rtemp = x[2*ia] - x[2*m];
* x[2*ia] = x[2*ia] + x[2*m];
* itemp = x[2*ia+1] - x[2*m+1];
* x[2*ia+1] = x[2*ia+1] + x[2*m+1];
* x[2*m] = c*rtemp - s*itemp;
* x[2*m+1] = c*itemp + s*rtemp;
* ia++;
* }
* ia += n2;
* }
* n2 <<= 1;
* }
* }
*
* DESCRIPTION
* This routine is used to compute the Inverse, Complex, Radix-2,
* Decimation-in-Frequency Fast Fourier Transform of a single
* precision complex sequence of size n, and a power of 2.
* The routine requires bit-reversed input and bit-reversed
* coefficents (twiddle factors) and produces results that are
* in normal order. Final scaling is not done in this function.
*
* TECHNIQUES
* 1. Loading input x as well as coefficient w in double word.
* 2. Both loops j and i shown in the C code are placed in the
* INNERLOOP of the assembly code.
* 3. mpy was used to perform a mv. EX. mpy x, 1, y <=> mv x, y
* 4. Because the data loads are 1 itteration ahead of the
* coefficent loads, counter i was copied to counter m so that
* the actual count could live longer for the coefficent loads.
* 5. Two output pointers/counters are maintained to remove the
* dependency between the X'a and Y's - the Y's have a much longer
* latency path than the X's.
* 6. Inner loop prolog and epilog are done in parallel with the
* outer loop.
*
* ASSUMPTIONS
* n >= 8
*
* Both input x and coefficient w should be aligned on double word
* (8 byte) boundary.
*
* The follwoing C code is used to generate the coefficient table
* (non-bit reversed).
*
* #include <math.h>
* /* generate real and imaginary twiddle
* table of size n/2 complex numbers */
*
* gen_w_r2(float* w, int n)
* {
* int i;
* float pi = 4.0*atan(1.0);
* float e = pi*2.0/n;
*
* for(i=0; i < ( n>>1 ); i++)
* {
* w[2*i] = cos(i*e);
* w[2*i+1] = sin(i*e);
* }
* }
*
*
* The follwoing C code is used to bit-reverse the coefficents.
*
* bit_rev(float* x, int n)
* {
* int i, j, k;
* float rtemp, itemp;
*
* j = 0;
* for(i=1; i < (n-1); i++)
* {
* k = n >> 1;
* while(k <= j)
* {
* j -= k;
* k >>= 1;
* }
* j += k;
* if(i < j)
* {
* rtemp = x[j*2];
* x[j*2] = x[i*2];
* x[i*2] = rtemp;
* itemp = x[j*2+1];
* x[j*2+1] = x[i*2+1];
* x[i*2+1] = itemp;
* }
* }
* }
*
* The follwoing C code is used to perform the final scaling
* of the IFFT.
*
* /* divide each element of x by n */
* divide(float* x, int n)
* {
* int i;
* float inv = 1.0 / n;
*
* for(i=0; i < n; i++)
* {
* x[2*i] = inv * x[2*i];
* x[2*i+1] = inv * x[2*i+1];
* }
* }
*
*
* MEMORY NOTE
*
* Data (x) 8*N bytes
* Coefficients (w) 8*N/2 bytes
* Stack 4*10 bytes
* Program 800 bytes
*
* Note 1: Data and Coefficents must reside in different memory
* blocks to avoid memory conflicts.
*
* Note 2: Data and Coefficents must be aligned to an 8 byte
* boundary.
*
* CYCLES
*
* # of cycles = 21 + 4 + M*((N/2-2)*4 + 24)
* / \ \ \
* C preservation ___/ \ \ \___ Loop L Prolog/Epilog
* \ \ +
* \ \ Loop K
* \ \
* \ \___ Loop L
* \
* \___ Loop K Prolog
*
* where: N is the number of point in the IFFT
* M = log(base 2)N, is the number of stages in the IFFT
*
* Example: 1024 Point FFT Performance
* N = 1024, M = 10, assume a 167MHz CPU clock
*
* # of cycles = 21 + 4 + 10*((1024/2-2)*4 + 24) = 20665
*
* time = # of cycles * CPU clock = 20665/167*10^6 = 124.0 usec
*
*
* EXAMPLE USAGE
*
* void main(void)
* {
* gen_w_r2(w, N); // Generate coefficient table
* bit_rev(w, N>>1); // Bit-reverse coefficient table
* cfftr2_dit(x, w, N); // This is the radix 2 FFT benchmark available
* // from TI
* // input in normal order, output in bit-reversed
* // order
* // coefficient table in bit-reversed order
* icfftr2_dif(x, w, N); // Inverse radix 2 FFT
* // input in bit-reversed order, output in normal
* // order
* // coefficient table in bit-reversed order
* divide(x, N); // scale inverse FFT output
* // result is the same as original input to FFT
* }
*
* Since the twiddle table is in bit-reversed order, it turns out that
* the same twiddle table will also work for smaller IFFTs. This
* means that if you need to do both 512 and 1024 point IFFTs in the
* same application, you only need to have the 1024 point twiddle
* table. The 512 point FFT will use the first half of the 1024
* point twiddle table.
*
*
* IMPLEMENTATION
*
* The above C implemetation of the IFFT has been modified to better fit
* the 'C67xx architecture thus allowing the translation from C to hand
* coded assembly easier. The modified function is listed below and is
* functionally equivelent to the above function. Note, the C statements
* in this function are used as comment for the equivelent assembly
* statements (see the optimized assembly listeing).
*
* void icfftr2_dif(float* x, float* w, short n)
* {
* short n2, i, k, l, p, m, j, n2A;
* float rtemp, itemp, s, c, xr, xi, yr, yi, Xr, Xi, Yr, Yi;
* float *wptrB, *xinptrA, *xoutptrB, *xoutptrA, p1r, p2r, p1i, p2i;
*
* n2 = 1;
* wptrB = w;
* xinptrA = x;
* xoutptrB = x;
* c=*wptrB++;
* s=*wptrB++;
* xoutptrA = xoutptrB + 2*n2;
* xoutptrB = 1 + xoutptrB;
* i = n2;
* j = n2;
* p = n2;
*
* for(k=n; k > 1; k >>= 1)
* {
* for (l=0; l<n/2; l++)
* {
* yr = xinptrA[2*n2];
* yi = xinptrA[2*n2 + 1];
* xr = *xinptrA++;
* xi = *xinptrA++;
* j = j - 1;
* i = i - 1;
* p = p -1;
* itemp = xi - yi;
* rtemp = xr - yr;
* Xi = xi + yi;
* Xr = xr + yr;
* if (i==0) xinptrA = xinptrA + 2*n2;
* p1i = c*itemp;
* p2i = s*rtemp;
* p1r = c*rtemp;
* p2r = s*itemp;
* *xoutptrB = Xi;
* xoutptrB = xoutptrB + 2;
* *(xoutptrB - 3) = Xr;
* Yi = p1i + p2i;
* Yr = p1r - p2r;
* if (p==0) xoutptrB = xoutptrB + 2*n2;
* m = i;
* if (i==0) i = n2;
* *xoutptrA++ = Yr;
* *xoutptrA++ = Yi;
* if (j==0) xoutptrA = xoutptrA + 2*n2;
* if (m==0) {
* c=*wptrB++;
* s=*wptrB++;
* }
* if (p==0) p = n2;
* if (j==0) j = n2;
* }
*
* n2 = n2 << 1;
* xinptrA = x;
* xoutptrB = x;
* wptrB = w;
* c=*wptrB++;
* s=*wptrB++;
* i = n2;
* j = n2;
* p = n2;
* xoutptrA = xoutptrB + 2*n2;
* xoutptrB = xoutptrB + 1;
* }
* }
*
*
* NOTATIONS
*
* f = Function Prolog or Epilog
* o = Outer Loop
* p = Inner Loop Prolog
* e = Inner Loop Epilog
*
*===============================================================================
;void icfftr2_dif(float* x, float* w, short n)
; {
.def _icfftr2_dif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -