📄 example 3-36.asm
字号:
; Example 3 - 36. DIF Radix-2 FFT Implementation ASM Listing for the TMS320C64x DSP
* ========================================================================= *
* *
* TEXAS INSTRUMENTS, INC. *
* *
* NAME *
* DSP_radix2 -- In-place Radix-2 FFT (Little Endian) *
* *
* REVISION DATE *
* 09-Dec-2002 *
* *
* USAGE *
* This routine is C-callable and can be called as: *
* *
* void DSP_radix2(int n, short *restrict xy, *
* const short *restrict w); *
* *
* n -- FFT size (input) *
* xy[] -- input and output sequences (dim-n) (input/output) *
* w[] -- FFT coefficients (dim-n/2) (input) *
* *
* DESCRIPTION *
* This routine is used to compute the FFT of a complex sequence of size *
* n, a power of 2, with the "decimation-in-frequency decomposition" *
* method, i.e., the output is in bit-reversed order. Each complex *
* value has interleaved 16-bit real and imaginary parts. To *
* prevent overflow, input samples may have to be scaled by 1/n. *
* *
* void DSP_radix2(int n, short *restrict xy, *
* const short *restrict w) *
* { *
* short n1,n2,ie,ia,i,j,k,l; *
* short xt,yt,c,s; *
* *
* n2 = n; *
* ie = 1; *
* for (k=n; k > 1; k = (k >> 1) ) *
* { *
* n1 = n2; *
* n2 = n2>>1; *
* ia = 0; *
* for (j=0; j < n2; j++) *
* { *
* c = w[2*ia]; *
* s = w[2*ia+1]; *
* ia = ia + ie; *
* for (i=j; i < n; i += n1) *
* { *
* l = i + n2; *
* xt = xy[2*l] - xy[2*i]; *
* xy[2*i] = xy[2*i] + xy[2*l]; *
* yt = xy[2*l+1] - xy[2*i+1]; *
* xy[2*i+1] = xy[2*i+1] + xy[2*l+1]; *
* xy[2*l] = (c*xt + s*yt)>>15; *
* xy[2*l+1] = (c*yt - s*xt)>>15; *
* } *
* } *
* ie = ie<<1; *
* } *
* } *
* *
* ASSUMPTIONS *
* 16 <= n <= 32768 *
* Both input xy and coefficient w must be aligned on word boundary. *
* The w coefficients are stored in the order k*(-cos[0*delta]), *
* k*(-sin[0*delta]), k*(-cos[1*delta]), ... where delta = 2*PI/N *
* and k = 32767. *
* Assembly code is written for processor in Little Endian mode *
* Input xy and coefficients w are 16 bit data. *
* *
* MEMORY NOTE *
* Align xy and w on different word boundaries to minimize *
* memory bank hits. *
* *
* TECHNIQUES *
* 1. Loading input xy as well as coefficient w in word. *
* 2. Both loops j and i shown in the C code are placed in the *
* INNERLOOP of the assembly code. *
* *
* CYCLES *
* cycles = log2(N) * (4*N/2+7) + 34 + N/4. *
* *
* (The N/4 term is due to bank conflicts that occur when xy and w *
* are aligned as suggested above, under "MEMORY NOTE.") *
* *
* For N = 256, cycles = 4250. *
* *
* CODESIZE *
* 800 bytes *
* *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
; ---------------------------------------------------------------------------
; _DSP_radix2: entry point -- register save, interrupt disable, loop setup,
; and the first loads that prime the OUTLOOP pipeline.
;
; Registers consumed on entry (as used by the code below):
;   A4 = n, B4 = xy, A6 = w
;   NOTE(review): presumably the standard C6000 argument registers for
;   DSP_radix2(n, xy, w) -- confirm against the calling convention in use.
;
; Stack frame: 12 words are reserved below the incoming SP (B15); A10-A15,
; B10-B14 and the original CSR are stored at SP[1]..SP[12] as annotated on
; each STW. A15 is used as a second (A-side) save pointer so that two
; stores can issue per cycle.
; ---------------------------------------------------------------------------
.sect ".text:_radix2"
.global _DSP_radix2
_DSP_radix2:
STW .D2T1 A15, *B15--[12] ; A15 -> old SP (= new SP[12]); SP then drops 12 words
|| SUB .L1X B15, 4, A15 ; A15 = old SP - 4: A-side save pointer
STW .D1T1 A10, *A15--[10] ; push A10 to SP[11]; A15 then drops to SP + 4
|| STW .D2T2 B10, *+B15[10] ; push B10 to SP[10]
|| MVC .S2 CSR, B0 ; B0 = CSR on entry (stored to SP[1] below)
STW .D1T1 A11, *+A15[8] ; push A11 to SP[ 9]
|| STW .D2T2 B11, *+B15[8] ; push B11 to SP[ 8]
|| AND .S2 B0, -2, B1 ; B1 = CSR with bit 0 cleared
;-
STW .D1T1 A12, *+A15[6] ; push A12 to SP[ 7]
|| STW .D2T2 B12, *+B15[6] ; push B12 to SP[ 6]
|| MVC .S2 B1, CSR ; write back CSR with bit 0 cleared
;== Interrupts disabled here ===
STW .D1T1 A13, *+A15[4] ; push A13 to SP[ 5]
|| STW .D2T2 B13, *+B15[4] ; push B13 to SP[ 4]
STW .D1T1 A14, *+A15[2] ; push A14 to SP[ 3]
|| STW .D2T2 B14, *+B15[2] ; push B14 to SP[ 2]
|| MV .L1X B4,A9 ; A9 = xy (base address of the data array)
;-
LMBD .L1 1,A4,A1 ; A1 = leftmost-1 bit detect on n (outer loop count calculation)
|| MV .L2X A4,B13 ; B13 = n (the value, not an address)
|| STW .D1T2 B0, *+A15[0] ; push original CSR to SP[ 1]
MVK .S1 1,A2 ; IE = 1 (index step used when loading W)
|| MV .D2 B13,B10 ; B10 = n: word index step for the X[i] & Y[i] loads
|| MV .L1 A4,A7 ; A7 = n: word index step for the X[i+N2] & Y[i+N2] loads
|| SHL .S2 B13,2,B14 ; B14 = 4*n (reset offset, adjusted below)
|| MV .L2X A6,B12 ; permanent ptr for W[0]
;-
SHR .S2 B13,1,B13 ; B13 = n/2 (basis for the loop count)
|| SUB .D2 B14,4,B14 ; B14 = 4*n - 4: reset offset for the B9 load pointer
|| SUB .L1X B14,4,A8 ; A8 = 4*n - 4 (reads B14 before the parallel SUB updates it)
ADDAH .D1 A9,A7,A3 ; A3 = xy + n halfwords: load ptr for X[i+N2] & Y[i+N2]
|| MV .L2X A9,B9 ; B9 = xy: load ptr for X[i] & Y[i]
|| SUB .D2 B13,4,B13 ; B13 = n/2 - 4: inner loop count
|| MVK .S2 31,B7 ; B7 = 31 for the outer loop count calculation
;-
MV .S2 B9,B4 ; setup store ptr for X[i] & Y[i]
|| MV .L2 B9,B11 ; permanent ptr for X[0] & Y[0]
|| SUB .L1X B7,A1,A1 ; outer loop counter: A1 = 31 - LMBD result (log2(n) for a power-of-2 n)
LDW .D2 *B9++[B10],B7 ; B7 = X[i] & Y[i] as one word; B9 advances by B10 words
|| MV .L2X A2,B2 ; reset twiddle factor counter: B2 = IE
|| LDW .D1 *A6++[A2],A5 ; CS = W[IA] & W[IA+1] as one word; A6 advances by IE words
|| SHL .S1 A7,1,A0 ; A0 = 2*n (reset offset)
;-
[ B2] SUB B2,1,B2 ; decrement twiddle factor counter while it is non-zero
|| LDW .D1 *A3++[A7],A13 ; A13 = X[i+N2] & Y[i+N2] as one word; A3 advances by A7 words
[!B2] SUB B9,B14,B9 ; counter hit zero: rewind the X[i] & Y[i] load ptr by B14
[!B2] SUB A3,A8,A3 ; counter hit zero: rewind the X[i+N2] & Y[i+N2] load ptr by A8
; ---------------------------------------------------------------------------
; OUTLOOP: visible start of the loop body. The instructions whose comment
; begins with '*' repeat the loads, counter update and pointer reset issued
; just before this label.
; NOTE(review): the '*' marker presumably tags instructions that belong to
; a later software-pipelined iteration -- confirm.
; ---------------------------------------------------------------------------
OUTLOOP:
LDW .D2 *B9++[B10],B7 ;* B7 = X[i] & Y[i]; B9 advances by B10 words
||[!B2] MV .L2X A2,B2 ;* reset twiddle factor counter to IE once it reaches zero
||[!B2] LDW .D1 *A6++[A2],A5 ;* CS = W[IA] & W[IA+1], loaded only when the counter is zero
|| MPY .M2 B2,1,B0 ; B0 = B2 * 1: copy of the counter made on the .M unit
;-
[ B2] SUB B2,1,B2 ;* decrement twiddle factor counter
|| LDW .D1 *A3++[A7],A13 ;* A13 = X[i+N2] & Y[i+N2]; A3 advances by A7 words
SUB2 .S1X A13,B7,A9 ; XYT = X[i+N2]-X[i] & Y[i+N2]-Y[i] (A13 - B7, as xt/yt in the C reference)
|| MV A5,A10 ; A10 = A5: copy of CS. NOTE(review): old comment said "move to other file", but both are A-side registers
|| ADD2 .S2X B7,A13,B7 ; X[i]+X[i+N2] & Y[i]+Y[i+N2]
||[!B2] SUB B9,B14,B9 ;* reset load X[i] & Y[i] ptrs
; NOTE(review): the listing stops here in this file; the remainder of the
; loop and the function epilogue are not present.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -