📄 icfftr2_dif.asm

📁 教程《DSP 原理及其 C 编程开发技术》中的源代码
💻 ASM
📖 第 1 页 / 共 2 页
字号:
12 下一页
;File was assembled with assembler version 4.0 (included with CCS V1.2) and
;NOT with assembler version 4.1 (included with CCS V2) due to "BUG" SDSsq21569.
;icfftr2_dif.asm file used as source in project in lieu of icfftr2_dif.sa. 
;RC,August 2001.


;******************************************************************************
;* TMS320C6x ANSI C Codegen                                      Version 4.00 *
;* Date/Time created: Mon Aug 20 20:26:58 2001                                *
;******************************************************************************

;******************************************************************************
;* GLOBAL FILE PARAMETERS                                                     *
;*                                                                            *
;*   Architecture      : TMS320C671x                                          *
;*   Optimization      : Enabled at level 1                                   *
;*   Optimizing for    : Compile time 1st, speed 2nd                          *
;*                       Based on options: -o1, no -ms                        *
;*   Endian            : Little                                               *
;*   Interrupt Thrshld : Disabled                                             *
;*   Memory Model      : Small                                                *
;*   Calls to RTS      : Near                                                 *
;*   Pipelining        : Disabled                                             *
;*   Memory Aliases    : Presume are aliases (pessimistic)                    *
;*   Debug Info        : Debug                                                *
;*                                                                            *
;******************************************************************************

FP	.set	A15
DP	.set	B14
SP	.set	B15
	.global	$bss

	.file	"icfftr2_dif.sa"
*=============================================================================== 
*
* From: https://www-a.ti.com/apps/c6000/xt_download.asp?sku=C67x_icfftr2
* 
*     TEXAS INSTRUMENTS, INC. 
* 
*     INVERSE COMPLEX RADIX-2 DECIMATION-IN-FREQUENCY FFT 
* 
*     Revision Date: 07/06/98
* 
*     USAGE 
*           This routine is C-callable and can be called as: 
* 
*           void icfftr2_dif(float* x, float* w, short n) 
* 
*		x[] --- input and output sequences (dim-n)      (input/output)
*			x has n complex numbers (2*n SP values).  
*			The real and imaginary values are interleaved
*			in memory: re0:im0, re1:im1, .....
*               w[]  --- FFT coefficients (dim-n/2)             (input)
*			w has n/2 complex numbers (n SP values). 
*			FFT coeficients must be in bt-reversed order
*			The real and imaginary values are interleaved
*			in memory: re0:im0, re1:im1, .....
*               n    --- FFT size                               (input)       
* 
*           If the routine is not to be used as a C-callable function, 
*           then all instructions relating to the stack should be removed. 
*           See comments of individual instructions to determine if they are 
*           related to the stack. You also need to initialize all passed 
*           parameters since these are assumed to be in registers as defined by 
*           the calling convention of the compiler, (See the C compiler 
*           reference guide.) 
*
*     C CODE 
*           This is the C equivalent of the assembly code without restrictions: 
*           Note that the assembly code is hand optimized and restrictions may 
*           apply. 
* 
*		void icfftr2_dif(float* x, float* w, short n)
*		{
*		   short n2, ie, ia, i, j, k, m;
*		   float rtemp, itemp, c, s;
*		
*		   n2 = 1;
*		   ie = n;
*		   for(k=n; k > 1; k >>= 1)
*		   {
*		      ie >>= 1;
*		      ia = 0;
*		      for(j=0; j < ie; j++)
*		      {
*		         c = w[2*j];
*		         s = w[2*j+1];
*		         for(i=0; i < n2; i++)
*		         {
*		            m = ia + n2;
*		            rtemp     = x[2*ia]   - x[2*m];
*		            x[2*ia]   = x[2*ia]   + x[2*m];
*		            itemp     = x[2*ia+1] - x[2*m+1];
*		            x[2*ia+1] = x[2*ia+1] + x[2*m+1];
*		            x[2*m]    = c*rtemp   - s*itemp;
*		            x[2*m+1]  = c*itemp   + s*rtemp;
*			    ia++;
*		         }
*		         ia += n2;
*		      }
*		      n2 <<= 1;
*		   }
*		}
* 
*       DESCRIPTION
*		This routine is used to compute the Inverse, Complex, Radix-2, 
*		Decimation-in-Frequency Fast Fourier Transform of a single
*		precision complex sequence of size n, and a power of 2.
* 		The routine requires bit-reversed input and bit-reversed 
*		coefficents (twiddle factors) and produces results that are
* 		in normal order.  Final scaling is not done in this function.
*
*       TECHNIQUES
*            1. Loading input x as well as coefficient w in double word.
*            2. Both loops j and i shown in the C code are placed in the
*               INNERLOOP of the assembly code.
*            3. mpy was used to perform a mv.  EX. mpy x, 1, y <=> mv x, y
*	     4. Because the data loads are 1 itteration ahead of the 
*		coefficent loads, counter i was copied to counter m so that
*	 	the actual count could live longer for the coefficent loads.
*            5. Two output pointers/counters are maintained to remove the
*		dependency between the X'a and Y's - the Y's have a much longer
*		latency path than the X's.
*	     6.	Inner loop prolog and epilog are done in parallel with the 
*		outer loop.
*
*       ASSUMPTIONS
*		n >= 8
*
*               Both input x and coefficient w should be aligned on double word
*               (8 byte) boundary.
* 	
*		The follwoing C code is used to generate the coefficient table 
*		(non-bit reversed).
*
*		#include <math.h>
*		/* generate real and imaginary twiddle 
*			table of size n/2 complex numbers */
*
*		gen_w_r2(float* w, int n)
*		{
*	   	int i;
*	   	float pi = 4.0*atan(1.0);
*	   	float e = pi*2.0/n;
*
*	   		for(i=0; i < ( n>>1 ); i++)
*	   		{
*	     			w[2*i]   = cos(i*e);
*	     			w[2*i+1] = sin(i*e);
*	   		}
*		}
*
*
* 		The follwoing C code is used to bit-reverse the coefficents.
*
*		bit_rev(float* x, int n)
*		{
*		  int i, j, k;
*		  float rtemp, itemp;
*
*		  j = 0;
*		  for(i=1; i < (n-1); i++)
*		  {
*		     k = n >> 1;
*		     while(k <= j)
*		     {
*		        j -= k;
*		        k >>= 1;
*		     }
*		     j += k;
*		     if(i < j)
*		     {
*		        rtemp    = x[j*2];
*		        x[j*2]   = x[i*2];
*		        x[i*2]   = rtemp;
*		        itemp    = x[j*2+1];
*		        x[j*2+1] = x[i*2+1];
*		        x[i*2+1] = itemp;
*		     }
*		  }
*		}
*
* 		The follwoing C code is used to perform the final scaling 
*		of the IFFT.
*
*		/* divide each element of x by n */
*		divide(float* x, int n)
*		{
*		   int i;
*		   float inv = 1.0 / n;
*
*		   for(i=0; i < n; i++)
*		   {
*		     x[2*i]   = inv * x[2*i];
*		     x[2*i+1] = inv * x[2*i+1];
*		   }
*		}
*
*
* 	MEMORY NOTE
*
*		Data (x)		8*N    bytes
*		Coefficients (w)	8*N/2  bytes
*		Stack			4*10   bytes
*		Program			800    bytes
*	
*		Note 1:  Data and Coefficents must reside in different memory
*		blocks to avoid memory conflicts.
*
*		Note 2:	 Data and Coefficents must be aligned to an 8 byte 
*		boundary.
*
*       CYCLES 
*
*	   # of cycles = 21 + 4 + M*((N/2-2)*4 + 24)
*                        /    \         \         \
*    C preservation  ___/      \         \         \___ Loop L Prolog/Epilog
*                               \         \                     +
*                                \         \                  Loop K
*                                 \         \
*                                  \         \___ Loop L
*                                   \
*                                    \___ Loop K Prolog
*
*		where:  N is the number of point in the IFFT
*			M = log(base 2)N, is the number of stages in the IFFT
*
*	Example:  1024 Point FFT Performance
*		N = 1024, M = 10, assume a 167MHz CPU clock
*
*		# of cycles = 21 + 4 + 10*((1024/2-2)*4 + 24) = 20665
*
*		time = # of cycles * CPU clock = 20665/167*10^6 = 124.0 usec
*
*
* 	EXAMPLE USAGE
*
* void main(void)
* {
*    gen_w_r2(w, N);         // Generate coefficient table
*    bit_rev(w, N>>1);       // Bit-reverse coefficient table
*    cfftr2_dit(x, w, N);    // This is the radix 2 FFT benchmark available 
*			     // from TI
*                            // input in normal order, output in bit-reversed 
*			     // order
*                            // coefficient table in bit-reversed order
*    icfftr2_dif(x, w, N);   // Inverse radix 2 FFT
*                            // input in bit-reversed order, output in normal 
*			     // order
*                            // coefficient table in bit-reversed order
*    divide(x, N);           // scale inverse FFT output
*                            // result is the same as original input to FFT
* }
*
*	Since the twiddle table is in bit-reversed order, it turns out that
*	the same twiddle table will also work for smaller IFFTs.  This
*	means that if you need to do both 512 and 1024 point IFFTs in the
*	same application, you only need to have the 1024 point twiddle
*	table.  The 512 point FFT will use the first half of the 1024
*	point twiddle table.
*
*
* 	IMPLEMENTATION
*
*	The above C implemetation of the IFFT has been modified to better fit
*	the 'C67xx architecture thus allowing the translation from C to hand
*	coded assembly easier.  The modified function is listed below and is 
*	functionally equivelent to the above function.  Note, the C statements
*	in this function are used as comment for the equivelent assembly
*	statements (see the optimized assembly listeing).
*
* void icfftr2_dif(float* x, float* w, short n)
* {
*	short n2, i, k, l, p, m, j, n2A;
*	float rtemp, itemp, s, c, xr, xi, yr, yi, Xr, Xi, Yr, Yi;
*	float *wptrB, *xinptrA, *xoutptrB, *xoutptrA, p1r, p2r, p1i, p2i;
*	
*	n2 = 1;
*	wptrB = w;
*	xinptrA = x; 
*	xoutptrB = x; 	
*	c=*wptrB++; 
*	s=*wptrB++;
*	xoutptrA = xoutptrB + 2*n2;
*	xoutptrB = 1 + xoutptrB;
*	i = n2;
*	j = n2;
*	p = n2;
*	
*	for(k=n; k > 1; k >>= 1)
*	{
*		for (l=0; l<n/2; l++)
*		{
*			yr = xinptrA[2*n2];
*			yi = xinptrA[2*n2 + 1];
*			xr = *xinptrA++; 
*			xi = *xinptrA++;
*			j = j - 1;
*			i = i - 1;	
*			p = p -1;  
*			itemp = xi - yi;
*			rtemp = xr - yr;
*			Xi = xi + yi;
*			Xr = xr + yr;
*			if (i==0) xinptrA = xinptrA + 2*n2; 
*			p1i = c*itemp;
*			p2i = s*rtemp;
* 			p1r = c*rtemp;
* 			p2r = s*itemp;
*			*xoutptrB = Xi;
*			xoutptrB = xoutptrB + 2;
*			*(xoutptrB - 3) = Xr;		
*			Yi = p1i + p2i;
*			Yr = p1r - p2r;
*			if (p==0) xoutptrB = xoutptrB + 2*n2;
*			m = i; 	
*			if (i==0) i = n2;
*			*xoutptrA++ = Yr;
*			*xoutptrA++ = Yi;
*			if (j==0) xoutptrA = xoutptrA + 2*n2;
*			if (m==0) {
*				c=*wptrB++; 
*				s=*wptrB++;
*			}
*			if (p==0) p = n2;	
* 			if (j==0) j = n2;
*		}
*	
*		n2 = n2 << 1;
*		xinptrA = x;
*		xoutptrB = x;
*		wptrB = w;  	
*		c=*wptrB++; 
*		s=*wptrB++;
*		i = n2;
*		j = n2;
*		p = n2;
*		xoutptrA = xoutptrB + 2*n2;
*		xoutptrB = xoutptrB + 1;	
*	}
* }
*
*
*       NOTATIONS
*
*               f = Function Prolog or Epilog
*               o = Outer Loop
*               p = Inner Loop Prolog
*		e = Inner Loop Epilog
*
*===============================================================================
	;void icfftr2_dif(float* x, float* w, short n)
	; {

	.def _icfftr2_dif
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -