📄 cfftrad2x2non_scaled.asm

📁 ADI BF DSP的FFT汇编优化后的代码
💻 ASM
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*******************************************************************************
Copyright(c) 2000 - 2002 Analog Devices. All Rights Reserved.
Developed by Joint Development Software Application Team, IPDC, Bangalore, India
for Blackfin DSPs  ( Micro Signal Architecture 1.0 specification).

By using this module you agree to the terms of the Analog Devices License
Agreement for DSP Software. 
********************************************************************************
Module Name     : cfftrad2x2non_scaled.asm
Label name      :  __cfftrad2x2non_scaled
Version         :   1.3
Change History  :

                Version     Date          Author        Comments
                1.3         11/18/2002    Swarnalatha   Tested with VDSP++ 3.0
                                                        compiler 6.2.2 on 
                                                        ADSP-21535 Rev.0.2
                1.2         11/13/2002    Swarnalatha   Tested with VDSP++3.0
                                                        on ADSP-21535 Rev.0.2
                1.1         02/19/2002    Vijay         Modified to match
                                                        silicon cycle count
                1.0         03/13/2001    Vijay         Original 

Description  : The Decimation in time two-dimensional FFT has been implemented.
               For efficient implementation the first stage is pulled out since 
               there are no twiddle factor multiplications involved in this
               stage. The bit reversal is done at the input along with the first
               stage computation itself. Since the output of the butterfly at 
               every stage is scaled by four before feeding it to the next stage
               the last stage has to be done separately to avoid the final 
               output being scaled. The two dimensional bit-reversal can be 
               performed efficiently using the 1D bit-reversal instruction as 
               follows.

               1. The 2D input array (NxN) is stored by stretching it into a 1D 
               array of size N^2.
               2. Perform the 1D bit-reversal on this stretched array as usual
               3. The transposed matrix formed out of the 1D bit-reversed array
               is equivalent to a 2D bit-reversal.

               Infact the bit-reversal can be efficiently combined with the 
               first stage computation. The input and output data are complex 
               and are represented in 1Q15 format. The input and the output are 
               in normal order. The twiddle factor array that is passed to the 
               function must have atleast the first (N - 1) elements of an N 
               point twiddle factor.

                Butterfly : The equations of the 2D butterfly that has been 
                implemented is given below:

                X(k1,k2)  = S00(k1,k2) + W(k2)*S01(k1,k2) + 
                            W(k1)*S10(k1,k2) + W(k1 + k2)*S11(k1,k2)
                X(k1 + N/2,k2) = S00(k1,k2) + W(k2)*S01(k1,k2) - 
                                 W(k1)*S10(k1,k2) - W(k1 + k2)*S11(k1,k2)
                X(k1,k2 + N/2) = S00(k1,k2) - W(k2)*S01(k1,k2) + 
                                 W(k1)*S10(k1,k2) - W(k1 + k2)*S11(k1,k2)
                X(k1 + N/2,k2 + N/2) = S00(k1,k2) - W(k2)*S01(k1,k2) - 
                                       W(k1)*S10(k1,k2) + W(k1 + k2)*S11(k1,k2)

                where Sij(k1,k2) is the output of the group (i,j) of the 
                previous stage, W is the 1D twiddle factor array of length N.
 
Assumptions     : 1. The input matrix must be square, i.e., N x N.

                  2. The minimum size of the input matrix is 4 x 4.

                  3. The input buffer is aligned to a 2*N*N byte boundary for 
                     bit-reversal.

                  4. The input, output and the twiddle factor buffers are in 
                     different memory banks.
         
Prototype       : void _cfftrad2x2non_scaled(fract16* in, fract16* out, 
                                             fract16* w2d, int N)

               where
               in  -> the pointer to the input buffer.
               out -> the pointer to the output buffer.
               w2d -> the pointer to the twiddle factor.
               N   -> size of fft. For example if N = 8 then it is an 8 x 8 FFT.

Registers Used  : A0, A1, R0-R7, I0-I3, B0, B2, B3, M0-M3, L0-L3, P0-P5, LC0, 
                  LC1.

Performance     :
               Code size                     =  760  Bytes.
               Cycle count for 8 x 8 input   =  811  Cycles.
               Cycle count for 16 x 16 input =  3683 Cycles.
*******************************************************************************/
.section L1_code;
.align 8;
.global __cfftrad2x2non_scaled;

__cfftrad2x2non_scaled:

    [--SP] = (R7:4,P5:3);
    L0 = 0;
    L1 = 0;
    SP += -20;
    I0 = R0;                // Input buffer
    B3 = R1;                // Output buffer base address
    I1 = R0;                // Copy input buffer address
    I2 = R1;                // Copy output buffer address
    I3 = R1;                // Copy output buffer address
    R0 = [SP + 60];         // Fetch FFT length (N)
    R1 = R0 << 2 || [SP] = R2;
                            // Address of Twiddle factor array 
    R0 = R0.L * R0.L (IS) || P1 = [SP + 60];
    R3 = R1 >> 1 || I3 += 4;// 4*(N/2)
    M3 = R3;                // Modifier to skip one column of the input buffer 
                            // while bit-reversing
    M1 = R1;                // Modifier to move the output pointer by one row
    R1 += 8;                // 4*(N - 2)
    M2 = R1;                // Modifier that brings the output pointer to the 
                            // beginning of a new column
    R2 = R0 << 1 || I1 += M3 || R4 = [I0];
                            // 2*N^2, Fetch the first input of the 2D butterfly 
    M0 = R2;                // Modifier for 1-D bitreversal
    P2 = P1 >> 1;           // Inner loop counter is N/2 - 1 since one 
                            // butterfly output is computed ...
    P2 += -1;               // outside the outer loop
    B2 = B3;                // Copy base address of the output for circular 
                            // buffering
    R2 = R0 << 2 || I0 += M0 (BREV);
                            // 4*N^2 
    L2 = R2;                // Circular address length
    L3 = R2;                // Circular address length
    
    
/************* BIT REVERSAL & FIRST STAGE **********************************
    The outer loop and the inner loop together does the bit-reversal, input 
scaling 
    followed by the computation of first stage 
*****************************************************************************/
    
    LSETUP(ST_BIT_REV, END_BIT_REV) LC0 = P1 >> 1;
                            // Loop count - N/2 
ST_BIT_REV:
        R4 = R4 >>> 2 (V) || R6 = [I0] || I0 += M0 (BREV);
                            // Scale the input by 4 & fetch the ... 
        R6 = R6 >>> 2 (V) || R5 = [I1] || I1 += M0 (BREV);
                            // next data along with bit-reversal 
        R5 = R5 >>> 2 (V) || R7 = [I1] || I1 += M0 (BREV);
        R7 = R7 >>> 2 (V);  
        R4 = R4 +|+ R5, R5 = R4 -|- R5 (ASR);
                            // Compute the first butterfly of the ... 
        R6 = R6 +|+ R7, R7 = R6 -|- R7 (ASR);
                            // inner loop loop 
        R0 = R4 +|+ R6, R2 = R4 -|- R6 (ASR) || NOP;   

        LSETUP(ST_FIRST_STAGE, END_FIRST_STAGE) LC1 = P2;
                            // Loop count - N/2 - 1 
ST_FIRST_STAGE:
            R1 = R5 +|+ R7, R3 = R5 -|- R7 (ASR) || R4 = [I0] || I0 += M0(BREV);
            R4 = R4 >>> 2 (V) || R6 = [I0] || I0 += M0 (BREV);  
            R6 = R6 >>> 2 (V) || R5 = [I1] || I1 += M0 (BREV);
            R5 = R5 >>> 2 (V) || R7 = [I1] || [I2 ++ M1] = R0;
            R7 = R7 >>> 2 (V) || [I2 ++ M1] = R2  || I1 += M0 (BREV);   
            R4 = R4 +|+ R5, R5 = R4 -|- R5 (ASR) || [I3 ++ M1] = R1;
                            // Compute the remaining ... 
            R6 = R6 +|+ R7, R7 = R6 -|- R7 (ASR) || [I3 ++ M1] = R3;
                            // butterflies along with ... 
END_FIRST_STAGE:
            R0 = R4 +|+ R6, R2 = R4 -|- R6 (ASR);
                            // output scaling 
        R1 = R5 +|+ R7, R3 = R5 -|- R7 (ASR) || [I2 ++ M1] = R0 
        || I1 += M3 (BREV);
        I0 += M3 (BREV) || [I2 ++ M2] = R2;
                            // Modify the input pointers to skip the next column
        DISALGNEXCPT || R4 = [I0] || [I3 ++ M1] = R1;
                            // Store the output of the last ... 
END_BIT_REV:
        I0 += M0 (BREV) || [I3 ++ M2] = R3;
                            // butterfly in the inner loop here 
    L2 = 0;                 // Make the output buffers linear
    L3 = 0;             
    
/*********************** MIDDLE STAGES *****************************************
    The log 2 (N) - 2 stages are computed in this section. The output of every 
stage is scaled by 
    four to avoid overflow at the intermediate stages
*******************************************************************************/
    B2 = 2;                 // Initial value of 2^stage (stage = 1 at this
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -