📄 fft512pt.asm
字号:
/************************************************************************
  fft512pt.asm

  Prelim rev. February 25, 2004 - more documentation to come
  BL

  This is the assembly routine for the complex C-callable 512-point 16-bit
  FFT on the TigerSHARC family of DSPs.

  I. Description of Calling.

     1. Inputs:
          j4 -> input
          j5 -> ping_pong_buffer1
          j6 -> ping_pong_buffer2
          j7 -> output

     2. C-Calling Example:
          fft512pt(&(input), &(ping_pong_buffer1), &(ping_pong_buffer2), &(output));

     3. Limitations:
          a. All buffers must be aligned on a memory boundary which is a
             multiple of 4.
          b. Buffers input and ping_pong_buffer2 must be aligned on a memory
             boundary which is a multiple of 64.
          c. If memory space savings are required and input does not have to
             be preserved, ping_pong_buffer1 can be the same buffer as input
             with no degradation in performance.
          d. If memory space savings are required, output can be the same
             buffer as ping_pong_buffer2 with no degradation in performance.

     4. For the code to yield optimal performance, the following must be
        observed:
          a. Buffer input must have been cached previously. This is
             reasonable to assume since any engine that would have brought
             the data into internal memory, such as a DMA, would also have
             cached it.
          b. input and ping_pong_buffer2 must be located in different memory
             blocks.
          c. ping_pong_buffer1 and ping_pong_buffer2 must be located in
             different memory blocks.
          d. ping_pong_buffer1 and output must be located in different memory
             blocks.
          e. twiddles and input must be located in different memory blocks.
          f. AdjustMatrix and ping_pong_buffer1 must be located in different
             memory blocks.

  II. Description of the FFT algorithm.

     1. All data is treated as complex packed data.
     2.
An application note will be provided for the description of the math of the algorithm.************************************************************************///************************* Includes ************************************//#include <defTS201.h>//***********************************************************************.section data2a;.align 4; // allign to quad.var _AdjustMatrix[512] = "MatrixCoeffs.dat";.align 4; // allign to quad.var _twiddles16[64] = "Twiddles16.dat";.var _dummy1[4]; // Loop reads from this location on its exit, does not use the actual value.align 4; // allign to quad.var _twiddles32[16] = "Twiddles32.dat"; // W32_1, W32_2,...,W32_15.var _dummy2[4]; // Loop reads from this location on its exit, does not use the actual value.align 4; // allign to quad.var _k_modifies[8] = 0, 64, -32, 32, 0, 32, -16, 16;.align 4; // allign to quad.var _n_508[4] = 508, 508, 508, 508;//**********************************************************************************************.section program;.global _fft512pt;//************************************** Start of code *****************************************_fft512pt://PROLOGUEJ26 = J27 - 64; K26 = K27 - 64;;J27 = J27 - 28; K27 = K27 - 20;;q[J27 + 24] = XR27:24; q[K27 + 16] = YR27:24;;q[J27 + 20] = XR31:28; q[K27 + 12] = YR31:28;;//**********************************************************************************************// VERTICAL FFTs//**********************************************************************************************//*************************************** Stage 1 **********************************************// From j0->_input to k3->_ping_pong_buffer2 j0=j31+j4; k7:4=Q[k31+_k_modifies];; j1=j0+256; j8=64;; j2=j0+128; k3=j6;; j3=j0+(256+128);; // ---------------------------------- r5:4 =br Q[j2+=64];jL0=508;; //| F1 | | | | | r7:6 =br Q[j3+=j8]; r31=0x80000000;; //| F2 | | | | | r1:0 =br Q[j0+=j8]; kL3:0=Q[k31+_n_508];; //| F3 | | | | | r3:2 =br Q[j1+=j8]; kB3=k3+4; sr13:12=r5:4+r7:6, 
sr15:14=r5:4-r7:6;; //| F4 | | | AS1 | | // ---------------------------------- r5:4 =br Q[j2+=j8]; kB2=k31+_twiddles16;; //| F1+ | | | | | r7:6 =br Q[j3+=j8]; j9=-92; mr1:0+=r14**r31(CR); sr9:8=r1:0+r3:2, sr11:10=r1:0-r3:2;; //| F2+ | MPY1 | | AS2 | | r1:0 =br Q[j0+=j8]; kL2=k31+64; r24=mr1:0, mr1:0+=r15**r31(CR);; //| F3+ | MPY2 | M1 | | | r3:2 =br Q[j1+=j8]; LC1=8; r25=mr1:0, mr1:0+=r15**r31(CR); sr29:28=r5:4+r7:6, sr15:14=r5:4-r7:6;; //| F4+ | | M2 | AS1+ | | // ---------------------------------- r5:4 =br Q[j2+=j8]; jB0=kB3; sr17:16=r9:8+r13:12, sr21:20=r9:8-r13:12;; //| F1++ | | | AS3 | |.align_code 4; // ----------------------------------_VerFFTStage1Inner: // ---------------------------------- r7:6 =br Q[j3+=j8]; cb Q[k3+=k5]=r17:16; mr1:0+=r14**r31(CR); sr9:8=r1:0+r3:2, sr27:26=r1:0-r3:2;; r1:0 =br Q[j0+=j8]; cb Q[k3+=k6]=r21:20; r24=mr1:0, mr1:0+=r15**r31(CR); sr19:18=r11:10+r25:24, sr23:22=r11:10-r25:24;; r3:2 =br Q[j1+=j8]; cb Q[k3+=k5]=r19:18; r25=mr1:0, mr1:0+=r15**r31(CR); sr13:12=r5:4+r7:6, sr15:14=r5:4-r7:6;; // F4 r5:4 = Q[j2+=j9]; cb Q[k3+=k7]=r23:22; sr17:16=r9:8+r29:28, sr21:20=r9:8-r29:28;; r7:6 = Q[j3+=j9]; cb Q[k3+=k5]=r17:16; mr1:0+=r14**r31(CR); sr9:8=r1:0+r3:2, sr11:10=r1:0-r3:2;; r1:0 = Q[j0+=j9]; cb Q[k3+=k6]=r21:20; r24=mr1:0, mr1:0+=r15**r31(CR); sr19:18=r27:26+r25:24, sr23:22=r27:26-r25:24;; r3:2 = Q[j1+=j9]; cb Q[k3+=k5]=r19:18; r25=mr1:0, mr1:0+=r15**r31(CR); sr29:28=r5:4+r7:6, sr15:14=r5:4-r7:6;; // F4 r5:4 =br Q[j2+=j8]; cb Q[k3+=k7]=r23:22; sr17:16=r9:8+r13:12, sr21:20=r9:8-r13:12;; k2=k31+(_twiddles16+2);; r7:6 =br Q[j3+=j8]; cb Q[k3+=k5]=r17:16; mr1:0+=r14**r31(CR); sr9:8=r1:0+r3:2, sr27:26=r1:0-r3:2;; r1:0 =br Q[j0+=j8]; cb Q[k3+=k6]=r21:20; r24=mr1:0, mr1:0+=r15**r31(CR); sr19:18=r11:10+r25:24, sr23:22=r11:10-r25:24;; r3:2 =br Q[j1+=j8]; cb Q[k3+=k5]=r19:18; r25=mr1:0, mr1:0+=r15**r31(CR); sr13:12=r5:4+r7:6, sr15:14=r5:4-r7:6;; // F4 r5:4 =br Q[j2+=j8]; cb Q[k3+=k7]=r23:22; sr17:16=r9:8+r29:28, sr21:20=r9:8-r29:28;; r7:6 =br 
Q[j3+=j8]; cb Q[k3+=k5]=r17:16; mr1:0+=r14**r31(CR); sr9:8=r1:0+r3:2, sr11:10=r1:0-r3:2;; r1:0 =br Q[j0+=j8]; cb Q[k3+=k6]=r21:20; r24=mr1:0, mr1:0+=r15**r31(CR); sr19:18=r27:26+r25:24, sr23:22=r27:26-r25:24;; r3:2 =br Q[j1+=j8]; cb Q[k3+=k5]=r19:18; r25=mr1:0, mr1:0+=r15**r31(CR); sr29:28=r5:4+r7:6, sr15:14=r5:4-r7:6;; // F4.align_code 4; if NLC1E, jump _VerFFTStage1Inner; r5:4 =br Q[j2+=j8]; cb Q[k3+=k7]=r23:22; sr17:16=r9:8+r13:12, sr21:20=r9:8-r13:12;;//***************************************** Stage 2 ***********************************************// From j0->_ping_pong_buffer2 to k1->_ping_pong_buffer1.align_code 4; j0=j6+12*32; j1=-4*32;; r7:6 = Q[j0+=-4*32]; r31:30= L[k2+=-2];; //| F1 | | | | | r5:4 =cb Q[j0+=-4*32]; r29:28=cb L[k2+=6]; mr1:0+=r7**r31(CR);; r3:2 = Q[j0+=-4*32]; LC0=15; r15=mr1:0, mr1:0+=r6**r31(CR);; //| F3+ | MPY2 | M1 | | | r1:0 =cb Q[j0+=28*32]; j2=28*32; r14=mr1:0, mr1:0+=r5**r30(CR);; //| F3+ | MPY2 | M1 | | | r7:6 =cb Q[j0+=-4*32]; k1=j5; r13=mr1:0, mr1:0+=r4**r30(CR);; //| F3+ | MPY2 | M1 | | | r5:4 =cb Q[j0+=j1]; r12=mr1:0, mr1:0+=r3**r29(CR);; //| F3+ | MPY2 | M1 | | | r11=mr1:0, mr1:0+=r2**r29(CR);; //| F3+ | MPY2 | M1 | | | r10=mr1:0, mr1:0+=r1**r28(CR);; //| F3+ | MPY2 | M1 | | | r3:2 = Q[j0+=j1]; r9=mr1:0, mr1:0+=r0**r28(CR);; //| F3+ | MPY2 | M1 | | | r1:0 =cb Q[j0+=28*32]; r8=mr1:0, mr1:0+=r7**r31(CR); sr21:20=r13:12+r15:14, sr23:22=r13:12-r15:14;; // AS1 r15=mr1:0, mr1:0+=r6**r31(CR);; kB1=k1+4; r14=mr1:0, mr1:0+=r5**r30(CR); sr17:16=r9:8 +r11:10, sr19:18=r9:8 -r11:10;; // AS2 r7:6 =cb Q[j0+=j1]; r8=r23; r13=mr1:0, mr1:0+=r4**r30(CR); sr9=-r23;; // A2 r5:4 =cb Q[j0+=j1]; r31:30=cb L[k2+=-2];r12=mr1:0, mr1:0+=r3**r29(CR); sr23=-r22;; // A1 r11=mr1:0, mr1:0+=r2**r29(CR); lr9:8=rot r9:8 by -16;; // R2 k6=k31+4*32; r10=mr1:0, mr1:0+=r1**r28(CR); lr23:22=rot r23:22 by -16;; // R1.align_code 4; // ----------------------------------_VerFFTStage2: // ---------------------------------- r3:2 = Q[j0+=j1]; r23=r8; r9=mr1:0, 
mr1:0+=r0**r28(CR); sr17:16=r17:16+r21:20, sr21:20=r17:16-r21:20;; // AS3 r1:0 =cb Q[j0+=j2]; r29:28=cb L[k2+=6]; r8=mr1:0, mr1:0+=r7**r31(CR); sr25:24=r13:12+r15:14, sr27:26=r13:12-r15:14;; // AS1+ cb Q[k1+=k6]=r17:16; r15=mr1:0, mr1:0+=r6**r31(CR); sr19:18=r19:18+r23:22, sr23:22=r19:18-r23:22;; // AS4 cb Q[k1+=k6]=r19:18; r14=mr1:0, mr1:0+=r5**r30(CR); sr17:16=r9:8 +r11:10, sr19:18=r9:8 -r11:10;; // AS2+ r7:6 =cb Q[j0+=j1]; r8=r27; r13=mr1:0, mr1:0+=r4**r30(CR); sr9=-r27;; // A2+ r5:4 =cb Q[j0+=j1]; r12=mr1:0, mr1:0+=r3**r29(CR); sr27=-r26;; // A1+ cb Q[k1+=k6]=r21:20; r11=mr1:0, mr1:0+=r2**r29(CR); lr9:8=rot r9:8 by -16;; // R2+ cb Q[k1+=k6]=r23:22; r10=mr1:0, mr1:0+=r1**r28(CR); lr27:26=rot r27:26 by -16;; // R1+ r3:2 = Q[j0+=j1]; r27=r8; r9=mr1:0, mr1:0+=r0**r28(CR); sr17:16=r17:16+r25:24, sr25:24=r17:16-r25:24;; // AS3 r1:0 =cb Q[j0+=j2]; r8=mr1:0, mr1:0+=r7**r31(CR); sr21:20=r13:12+r15:14, sr23:22=r13:12-r15:14;; // AS1 cb Q[k1+=k6]=r17:16; r15=mr1:0, mr1:0+=r6**r31(CR); sr19:18=r19:18+r27:26, sr27:26=r19:18-r27:26;; // AS4+ cb Q[k1+=k6]=r19:18; r14=mr1:0, mr1:0+=r5**r30(CR); sr17:16=r9:8 +r11:10, sr19:18=r9:8 -r11:10;; // AS2 r7:6 =cb Q[j0+=j1]; r8=r23; r13=mr1:0, mr1:0+=r4**r30(CR); sr9=-r23;; // A2 r5:4 =cb Q[j0+=j1]; r31:30=cb L[k2+=-2];r12=mr1:0, mr1:0+=r3**r29(CR); sr23=-r22;; // A1 cb Q[k1+=k6]=r25:24; r11=mr1:0, mr1:0+=r2**r29(CR); lr9:8=rot r9:8 by -16;; // R2.align_code 4; // ---------------------------------- if NLC0E, jump _VerFFTStage2; // ---------------------------------- cb Q[k1+=k6]=r27:26; r10=mr1:0, mr1:0+=r1**r28(CR); lr23:22=rot r23:22 by -16;; // R1.align_code 4; jB0=j5+4; r23=r8; r9=mr1:0, mr1:0+=r0**r28(CR); sr17:16=r17:16+r21:20, sr21:20=r17:16-r21:20;; // AS3 j0=j31+j5; j1=j6; r8=mr1:0, mr1:0+=r7**r31(CR); sr25:24=r13:12+r15:14, sr27:26=r13:12-r15:14;; // AS1+ cb Q[k1+=k6]=r17:16; kL2=508; sr19:18=r19:18+r23:22, sr23:22=r19:18-r23:22;; // AS4 cb Q[k1+=k6]=r19:18; kB2=_AdjustMatrix+4; sr17:16=r9:8 +r11:10, sr19:18=r9:8 -r11:10;; // 
AS2+ cb Q[k1+=k6]=r21:20;; k2=k31+_AdjustMatrix; r8=r27; sr9=-r27;; // A2+ cb Q[k1+=k6]=r23:22; sr27=-r26;; r1:0= Q[j0+=32]; r29:28=cb Q[k2+=32]; lr9:8=rot r9:8 by -16;; // R2+ r3:2= Q[j0+=32]; r31:30=cb Q[k2+=32]; lr27:26=rot r27:26 by -16;; // R1+ r27=r8; sr17:16=r17:16+r25:24, sr25:24=r17:16-r25:24;; // AS3+//*************************************** MPY/Xpose **********************************************// From j0->ping_pong_buffer1 to j1->ping_pong_buffer2 r5:4=cb Q[j0+=32]; r21:20=cb Q[k2+=32]; mr1:0+=r0**r28(CR);; r7:6=cb Q[j0+=32]; r23:22=cb Q[k2+=32]; r8=mr1:0, mr1:0+=r1**r29(CR);; cb Q[k1+=k6]=r17:16; r12=mr1:0, mr1:0+=r2**r30(CR); sr19:18=r19:18+r27:26, sr27:26=r19:18-r27:26;; // AS4+ cb Q[k1+=k6]=r19:18; r9=mr1:0, mr1:0+=r3**r31(CR);; cb Q[k1+=k6]=r25:24; r13=mr1:0, mr1:0+=r4**r20(CR);; cb Q[k1+=k6]=r27:26; LC1=8; r10=mr1:0, mr1:0+=r5**r21(CR);;.align_code 4;_MultXposeLoopOuter: r1:0=cb Q[j0+=32]; r17:16=cb Q[k2+=32]; r14=mr1:0, mr1:0+=r6**r22(CR);; r3:2=cb Q[j0+=32]; r19:18=cb Q[k2+=32]; r11=mr1:0, mr1:0+=r7**r23(CR);; r5:4=cb Q[j0+=32]; r21:20=cb Q[k2+=32]; r15=mr1:0, mr1:0+=r0**r16(CR);; r7:6=cb Q[j0+=32]; r23:22=cb Q[k2+=32]; r24=mr1:0, mr1:0+=r1**r17(CR);;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -