fft_flp32.asm

来自「ADSP-TS101S and ADSP-TS201S Real and Com」· 汇编代码 · 共 488 行 · 第 1/2 页
ASM
488 行
/*  fft32.asm

	Prelim rev.	October 19, 2003 - BL
  	Rev. 1.0 - added real inputs case - PM

	This is assembly routine for the Complex radix-2 C-callable FFT on TigerSHARC 
	family of DSPs.

	I. Description of Calling.

		1. Inputs:
			j4 -> input (ping-pong buffer 1)
			j5 -> ping-pong buffer 1
      		j6 -> ping-pong buffer 2
			j7 -> output
			j27+0x18 -> N = Number of points
			j27+0x19 -> REAL or COMPLEX

		2. C-Calling Example:
			fft32(&(input), &(ping_pong_buffer1), &(ping_pong_buffer2), &(output), N, COMPLEX);

		3. Limitations:
			a. All buffers must be aligned on memory boundary which is a multiple of 4.
			b. N must be between 32 and MAX_FFT_SIZE.
			c. If memory space savings are required and input does not have to be
			   preserved, ping_pong_buffer1 can be the same buffer as input.
			d. If memory space savings are required, output can be the same buffer
			   as ping_pong_buffer2 if the number of FFT stages is even (i.e. 
			   Log2(N) is even) or the same as ping_pong_buffer1 if the number of
			   FFT stages is odd (i.e. Log2(N) is odd).

		4. MAX_FFT_SIZE can be selected via #define. Larger values allow for more choices
		   of N, but its twiddles will occupy more memory.
    5. This C - callable function can process up to 64K blocks of data on TS201 
        (16K blocks on TS101) because C environment itself necessitates memory. 
        Therefore, if more input points are necessary, assembly language development 
        may become a must. On TS201, a block of memory is 128K words long, so 
        maximum N is 128K real points or 64K complex points. TS101 contains 
        only 2 blocks of data memory of 64K words and 4 buffers must be
        accommodated. Therefore, maximum N is 32K real words or 16K complex words.

	
	II. Description of the FFT algorithm.

		1. The input data is treated as complex interleaved N-point.
		2. Due to re-ordering, no stage can be done in-place.
		3. The bit reversal and the first two stages are combined into
		   a single loop. This loop takes data from input and stores it
		   in the ping-pong buffer1.
		4. Each subsequent stage ping-pongs the data between the two ping-pong
		   buffers. The last stage uses FFT output buffer for its output.
		5. Although the FFT is designed to be called with any point size
		   N <= MAX_FFT_SIZE by subsampling the twiddle factors, for ADSP-TS20x
		   processors, the best cycle optimization is achieved when MAX_FFT_SIZE=N.
		   For ADSP-TS101 all choices of MAX_FFT_SIZE are equally optimal.
		   

	III. Description of the REAL FFT algorithm.

		1. The input data is treated as complex interleaved N/2-point. The N/2 point complex
		   FFT will be computed first. Thus, N is halved, now number of points = N/2.

		2. Details and source code of the N/2 point complex FFT are in II above.

		3. Real re-combine:
			Here the complex N/2-point FFT computed in the previous steps is recombined to
			produce the N-point real FFT. If G is the complex FFT and F is the real FFT,
			the formula for F is given by:

			F(n) = 0.5*(G(n)+conj(G(N/2-n))-0.5*i*exp(-2*pi*i*n/N)*(G(n)-conj(G(N/2-n)). 
			
			From this the following can be derived:

			conj(F(N/2-n)) = 0.5*(G(n)+conj(G(N/2-n))+0.5*i*exp(-2*pi*i*n/N)*(G(n)-conj(G(N/2-n)).

			Thus, this can be computed in (n,N/2-n) pairs, as follows (dropping factor of 2):

			G(n) ------------------------------->------------------------>--------> F(n)
                                            \ +/                     \ +/
                                             \/                       \/
                                             /\                       /\
                      conj                  / -\  exp(-2*pi*i*n)*i   / -\   conj
			G(N/2-n) -----> conj(G(N/2-n))------>------------------------>--------> F(N/2-n)

			This is very efficient on the TigerSHARC architecture due to the add/subtract
			instruction.

	
	IV. For all additional details regarding this algorithm and code, see EE-218
	    application note, available from the ADI web site.		   

*/
//************************************ Includes **********************************

#include "FFTDef.h"
#include "defts201.h"

//************************* Externs *************************************

.extern _twiddles;

//********************************* FFT Routine *********************************
.section program;
.global _FFT32;

_FFT32:

//********************************** Prologue ***********************************

	mENTER
    mPUSHQ(xR31:28)
	mPUSHQ(xR27:24)
	mPUSHQ(yR31:28)
	mPUSHQ(yR27:24)
	
//************************************ Setup *************************************
  j17 = [j27 + 0x18];;                    //j17 = N
  j11 = [j27 + 0x19];;                    // j11=COMPLEX or REAL, off the stack

	comp(j11,COMPLEX);;												// Complex or Real?
	if jeq, jump _FFTStages1and2;;
	j17=ashiftr j17;;													// if Real, half N
//********************************************************************************
_FFTStages1and2:

  j11 = j31 + j17;;                                  // j11=N

	xr3=j11; k7=k31+_twiddles;;									
	k1=j11; j8=lshiftr j11;;														// k1=N, j8=N/2
	j9=lshiftr j8; xr0=MAX_FFT_SIZE; xr3=LD0 r3;;								// j9=N/4, compute the twiddle stride
	k8=lshiftr k1; xr0=LD0 r0; xr1=j11;;
	k8=lshiftr k8; xr1=LD0 r1; xr2=(31-3);;										// k8=N/4, Compute Stages-3
	k0=j4; k10=lshiftr k8; xr1=r1-r0; xr0=lshift r0 by -32;;					// k0->input, xr1=bit difference between MAX and N
	k10=lshiftr k10; xr0=bset r0 by r1; xr30=r2-r3;;							// k10=N/16, xr30=Stages-3
	k10=k10-1; xr0=lshift r0 by 2; LC1=xr30;;									// k10=N/16-1, LC1=Stages-3
	k9=xr0; k4=k31+(MAX_FFT_SIZE/4-1);;
	k4=not k4; j10=lshiftr j9;;													// initial twiddles pointer mask, j10=N/8

//****************** Bit Reverse and Stages 1 & 2 ******************************

	k5=lshiftr k1;;																// k5=N/2
	j0=j31+j6; k6=k6-k6;;														// j0->ping_pong_buffer2
	j1=j0+j9; LC0=k10;;															// j1->ping_pong_buffer2+N/4, LC0=N/16-1
	j2=j1+j9; k1=k0+k5;;														// j2->ping_pong_buffer2+N/2, k1->input+N/2
	j3=j2+j9; k2=k1+k5;;														// j3->ping_pong_buffer2+3N/4, k2->input+N
	j12=j3+j9; k3=k2+k5;;														// j12->ping_pong_buffer2+N, k3->input+3N/2
	j13=j12+j9; k5=lshiftr k5;;													// j13->ping_pong_buffer2+5N/4, k5=N/4
	j14=j13+j9; r1:0=q[k0+k6];;													// j14->ping_pong_buffer2+3N/2
	j15=j14+j9; r3:2=q[k2+k6];;													// j15->ping_pong_buffer2+7N/4
	
	r5:4=q[k1+k6];;
	r7:6=q[k3+k6];;
	
	k6=k6+k5 (br); fr0=r0+r2, fr20=r0-r2;;
	r9:8=q[k0+k6]; fr2=r1+r3, fr29=r1-r3;;
	r11:10=q[k2+k6]; fr4=r4+r6, fr21=r4-r6;;
	r13:12=q[k1+k6]; fr5=r5+r7, fr28=r5-r7;;
		
	r15:14=q[k3+k6]; fr18=r8+r10, fr22=r8-r10;;			
	k6=k6+k5 (br); fr19=r9+r11, fr31=r9-r11;;			
	fr26=r12+r14, fr23=r12-r14;;						
	fr27=r13+r15, fr30=r13-r15;;						
		
	fr20=r20+r28, fr28=r20-r28;;						
	fr29=r29+r21, fr21=r29-r21;;						
	fr22=r22+r30, fr30=r22-r30;;						
	fr31=r31+r23, fr23=r31-r23;;						
	
.align_code 4;
_Stages1and2Loop:
		r1:0=q[k0+k6]; q[j2+=4]=yr23:20; fr16=r0+r4, fr24=r0-r4;;
		r3:2=q[k2+k6]; q[j3+=4]=xr23:20; fr17=r2+r5, fr25=r2-r5;;
		r5:4=q[k1+k6]; q[j14+=4]=yr31:28; fr18=r18+r26, fr26=r18-r26;;
		r7:6=q[k3+k6]; q[j15+=4]=xr31:28; fr19=r19+r27, fr27=r19-r27;;
		
		k6=k6+k5 (br); q[j0+=4]=yr19:16; fr0=r0+r2, fr20=r0-r2;;
		r9:8=q[k0+k6]; q[j1+=4]=xr19:16; fr2=r1+r3, fr29=r1-r3;;
		r11:10=q[k2+k6]; q[j12+=4]=yr27:24; fr4=r4+r6, fr21=r4-r6;;
		r13:12=q[k1+k6]; q[j13+=4]=xr27:24; fr5=r5+r7, fr28=r5-r7;;
		
		r15:14=q[k3+k6]; fr18=r8+r10, fr22=r8-r10;;			
		k6=k6+k5 (br); fr19=r9+r11, fr31=r9-r11;;							
		fr26=r12+r14, fr23=r12-r14;;						
		fr27=r13+r15, fr30=r13-r15;;						
		
		fr20=r20+r28, fr28=r20-r28;;						
		fr29=r29+r21, fr21=r29-r21;;						
		fr22=r22+r30, fr30=r22-r30;;						
.align_code 4;
		if NLC0E, jump _Stages1and2Loop;
		fr31=r31+r23, fr23=r31-r23;;						
		
	q[j2+=4]=yr23:20; fr16=r0+r4, fr24=r0-r4;;
	q[j3+=4]=xr23:20; fr17=r2+r5, fr25=r2-r5;;
	q[j14+=4]=yr31:28; fr18=r18+r26, fr26=r18-r26;;
	q[j15+=4]=xr31:28; fr19=r19+r27, fr27=r19-r27;;
		
	q[j0+=4]=yr19:16;;
	q[j1+=4]=xr19:16;;
	q[j12+=4]=yr27:24;;
	q[j13+=4]=xr27:24;;

//************************ Stages 3 to Log2(N)-1  *******************************

	j0=j31+j6; k5=k31+0;;

.align_code 4;
_StageLoop:	
		yr3:0=q[j0+=4];   k3=k5 and k4;;                                            		// F1,    K1
		xr3:0=q[j0+=4];   r5:4=l[k7+k3];;                                           		// F2,    K2
		LC0=k10;          k5=k5+k9;;                                                		//        K3,    M1

		yr11:8=q[j0+=4];  k3=k5 and k4;    fr6=r2*r4;;                                      // F1+,   K1+
		xr11:8=q[j0+=4];  r13:12=l[k7+k3]; fr7=r3*r5;;                              		// F2+,   K2+,   M2
                             			   fr14=r2*r5;;                             		//               M3
		j1=j31+j5;        k5=k5+k9;        fr6=r10*r12;  fr16=r6-r7;;               		//        K3+,   M1+,   A1
			
		yr23:20=q[j0+=4]; k3=k5 and k4;    fr15=r3*r4;;                             		// F1++,  K1++,  M4
		xr23:20=q[j0+=4]; r5:4=l[k7+k3];   fr7=r11*r13;;                            		// F2++,  K2++,  M2+
		                                   fr14=r10*r13; fr17=r14+r15;;             		//               M3+,   A2
		j2=j1+j11;         k5=k5+k9;        fr6=r22*r4;   fr18=r6-r7;;               		//        K3++,  M1++,  A1+

		yr31:28=q[j0+=4]; k3=k5 and k4;    fr15=r11*r12; fr24=r0+r16, fr26=r0-r16;; 		// F1+++, K1+++, M4+,   A3
		xr31:28=q[j0+=4]; r13:12=l[k7+k3]; fr7=r23*r5;   fr25=r1+r17, fr27=r1-r17;; 		// F2+++, K2+++, M2++,  A4
        q[j1+=4]=r25:24;                   fr14=r22*r5;  fr19=r14+r15;;             		// S1,           M3++,  A2+
		
			
.align_code 4;
_BflyLoop:	
			q[j2+=4]=r27:26;  k5=k5+k9;        fr6=r30*r12;  fr16=r6-r7;;                 	// S2----,K3-,   M1-,   A1--

			yr3:0=q[j0+=4];   k3=k5 and k4;    fr15=r23*r4;  fr24=r8+r18,  fr26=r8-r18;;  	// F1,    K1,    M4--,  A3---
			xr3:0=q[j0+=4];   r5:4=l[k7+k3];   fr7=r31*r13;  fr25=r9+r19,  fr27=r9-r19;;  	// F2,    K2,    M2-,   A4---
			q[j1+=4]=r25:24;                   fr14=r30*r13; fr17=r14+r15;;               	// S1---,        M3-,   A2--
			q[j2+=4]=r27:26;  k5=k5+k9;        fr6=r2*r4;    fr18=r6-r7;;                 	// S2---, K3,    M1,    A1-

			yr11:8=q[j0+=4];  k3=k5 and k4;    fr15=r31*r12; fr24=r20+r16, fr26=r20-r16;; 	// F1+,   K1+,   M4-,   A3--
			xr11:8=q[j0+=4];  r13:12=l[k7+k3]; fr7=r3*r5;    fr25=r21+r17, fr27=r21-r17;; 	// F2+,   K2+,   M2,    A4--
            q[j1+=4]=r25:24;       	           fr14=r2*r5;   fr19=r14+r15;;               	// S1--,         M3,    A2-
			q[j2+=4]=r27:26;  k5=k5+k9;        fr6=r10*r12;  fr16=r6-r7;;                 	// S2--,  K3+,   M1+,   A1
			
			yr23:20=q[j0+=4]; k3=k5 and k4;    fr15=r3*r4;   fr24=r28+r18, fr26=r28-r18;; 	// F1++,  K1++,  M4,    A3-
			xr23:20=q[j0+=4]; r5:4=l[k7+k3];   fr7=r11*r13;  fr25=r29+r19, fr27=r29-r19;; 	// F2++,  K2++,  M2+,   A4-
			q[j1+=4]=r25:24;                   fr14=r10*r13; fr17=r14+r15;;               	// S1-,          M3+,   A2
fft_flp32.asm - 源码说明

本页面展示了「ADSP-TS101S and ADSP-TS201S Real and Complex radix-2 C-callable FFT」中的 fft_flp32.asm 源码文件，采用汇编编程语言编写，共 488 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与ADSP-TS相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?