📄 imdct36.asm

📁 在ADI的Blackfin系列DSP上编写的语音＆音频程序
💻 ASM
字号:
/*******************************************************************************
Copyright(c) 2000 - 2002 Analog Devices. All Rights Reserved.
Developed by Joint Development Software Application Team, IPDC, Bangalore, India
for Blackfin DSPs  ( Micro Signal Architecture 1.0 specification).
    By using this module you agree to the terms of the Analog Devices License
Agreement for DSP Software. 
********************************************************************************
Module Name     : imdct36.asm
Label name      :  __imdct36
Version         :   1.3

Change History  :

                Version     Date          Author      Comments
                1.3         11/18/2002    Swarnalatha Tested with VDSP++ 3.0
                                                      compiler 6.2.2 on 
                                                      ADSP-21535 Rev.0.2
                1.2         11/13/2002    Swarnalatha Tested with VDSP++ 3.0
                                                      on ADSP-21535 Rev. 0.2
                1.1         01/22/2002    Vijay       Modified to match silicon
                                                      cycle count
                1.0         07/17/2001    Vijay       Original 

Description     : This asm routine is a 16-bit implementation of the 36-point 
                  inverse modified discrete cosine transform (IMDCT) which is
                  used in the MPEG Layer III audio codec. The equation of the
                  IMDCT for a sequence X of length N/2 is given below:

                    y(n)  = (2/N) * sum over k = 0 to N/2-1 of
                            { X(k) * cos(pi*(2k + 1)*(2n + 1 + N/2)/(2N)) }
                    k = 0 to N/2-1
                    n = 0 to N-1

                  Note : The MDCT exhibits antisymmetry. So the first eighteen
                  MDCT values are enough to compute the 36 point IMDCT
                  The algorithm used to implement the IMDCT is given as a
                  pseudocode below :

                    First stage computations :
                        X = sqrt(2)*inp;
                        y1(0) =  X(0);            y2(1) = -X(17);
                        y1(1) = -(X(2) + X(1));   y2(9) = X(2) - X(1);
                        y1(2) =  X(3) + X(4);     y2(8) = X(3) - X(4);
                        y1(3) = -(X(6) + X(5));   y2(7) = X(6) - X(5);
                        y1(4) =  X(7) + X(8);     y2(6) = X(7) - X(8);
                        y1(5) = -(X(10) + X(9));  y2(5) = X(10) - X(9);
                        y1(6) =  X(11) + X(12);   y2(4) = X(11) - X(12);
                        y1(7) = -(X(14) + X(13)); y2(3) = X(14) - X(13);
                        y1(8) =  X(15) + X(16);   y2(2) = X(15) - X(16);

                    9 Point IDCT : 
                        WC = cos(pi/18*(2*(0:8)'+1)*(0:8));
                        a(9:-1:1) = WC*y1';

                    9 Point IDST : 
                        WS = sin(pi/18*(2*(0:8)'+1)*((0:8)+1)); 
                        b = WS*[y2(9:-1:1)]';

                    Twiddle factor multiplications :
                        W1 = cos(pi/72*(17:-2:1));
                        W2 = sin(pi/72*(17:-2:1));
                        for i=0:8
                            c(i) = a(i)*W1(i) + b(8-i)*W2(i);
                            d(8-i) = -a(i)*W2(i) + b(8-i)*W1(i);
                        end

                    Last stage :
                        for i=0:8
                            x(i)    = (d(i) + c(8-i));
                            x(17-i) = -x(i);
                            x(35-i) = (d(i) - c(8-i));
                            x(i+18) = x(35-i);
                        end
                    x = x/36;

Prototype       : void _imdct36(fract16 *input, fract16 *output, fract16
                                *twid_coef);

Assumptions     : The input buffer is assumed to be aligned to a 4 byte boundary
                  The length of the input and output buffers are always 18 and
                  36, respectively. The twid_coef array that is passed to the
                  routine must have 9x9 cosine and sine coefficients interleaved
                  in the following manner :

                    i = 0;
                    for n = 0 to 8
                        for k = 0 to 8
                            twid_coef[i]   = cos(PI*k*(2*n + 1)/18);
                            twid_coef[i+1] = sin(PI*(k + 1)*(2*n + 1)/18);
                            i = i + 2;
                        end
                    end

                    Note : The index of the outer loop and inner loop should not
                    be interchanged and the twid_coef array is of type fract16
                    (short)

Registers Used  : A0, A1, R0-R3, R6, R7, I0-I3, B2, M0, L0-L3, P0-P2, P5.

Performance     :
                    Code size       :     428 bytes
                    Cycle count     :     258 cycles
*******************************************************************************/
.section L1_code;
.align 8;
.global __imdct36;

/*******************************************************************************
__imdct36 : 36-point IMDCT of 18 fract16 inputs, producing 36 fract16 outputs.

On entry (see the register moves at the top of the routine):
    R0 = input pointer     (read with 32-bit loads through I0)
    R1 = output pointer    (written with 16-bit stores)
    R2 = twid_coef pointer (read with 32-bit loads through I3)

Stack frame built by the prologue (offsets from the final SP):
    SP+40 .. SP+79  "upper" scratch, addressed through I1.
                    Holds the scaled input, then the IDCT/IDST results,
                    then the twiddle products (each stage overwrites the
                    previous one in place).
    SP+0  .. SP+39  "lower" scratch, addressed through I2.
                    SP+0..SP+35 is the nine-word first-stage buffer (made
                    circular with B2/L2 for the IDCT/IDST) and is later reused
                    for eight twiddle constants. SP+36 only receives
                    throw-away stores that prime the software pipelines.

The lower scratch area used to sit below the stack pointer (only 40 bytes
were reserved). Memory below SP is not owned by the routine, so anything that
pushes on the same stack asynchronously (interrupt or exception entry) could
overwrite the first-stage data or the twiddle constants. The frame is now 80
bytes so that every scratch access lies at or above SP. All I1/I2 addresses
are unchanged relative to the base captured in I1/I2.
NOTE: this adds two 16-bit instructions, so the code size and cycle count
quoted in the file header are slightly out of date.

Clobbers, besides the registers listed in the file header, the hardware loop
registers loaded by LSETUP (loop 0 and loop 1). L0-L3 are zero on exit.

The loops are software pipelined: most stages issue one more load or store
than they consume. In particular one 32-bit word past the 18 input samples
and one 32-bit word past the twid_coef table are read and discarded.
*******************************************************************************/
__imdct36:

    [--SP] = (R7:6, P5:5);  // Save call-preserved registers
    SP += -40;              // Reserve the upper scratch area
    L0 = 0;                 // Linear (non-circular) addressing for I0-I3
    L1 = 0;
    L2 = 0;
    L3 = 0;
    I0 = R0;                // Input pointer
    P2 = R1;                // Output pointer
    I3 = R2;                // Twiddle factor array
    I1 = SP;                // Base of the upper scratch area
    I2 = SP;                // The lower scratch area ends just below this
    SP += -40;              // Reserve the lower scratch area as well, so that
                            // nothing live is ever kept below SP
    M0 = -32 (X);           // Modifier used to rewind I1/I2 by eight words
    R3.L = 0x141D;          // sqrt(2)/9 in 1.15 format
    P0 = 9 (Z);             // Loop count
    R0 = [I0++] || I1 -= 4; // Scale the input data by sqrt(2)/9
                            // I1 is backed up one word because the first
                            // store in the loop below writes a stale R1

    // Single-instruction loop: scale one pair of samples, fetch the next
    // pair and store the previously scaled pair.
    LSETUP(ST_SCALE, ST_SCALE) LC0 = P0;
ST_SCALE:
        R1.H = R0.H * R3.L, R1.L = R0.L * R3.L || R0 = [I0++] || [I1++]= R1;
/*************************** FIRST STAGE **************************************/
    // R1 = scaled (X(17), X(16)). Store it as the ninth pair and rewind I1
    // to the first pair; R3 keeps the negated copy.
    R3 = -R1 (V) || [I1 ++ M0] = R1;
    R0 = [I1++];            // R0 = (X(1), X(0))
    R0.L = R0.L >>> 1 || R1 = [I1++];
                            // Halve X(0) to match the (ASR) sums below;
                            // R1 = (X(3), X(2))
    R3 = R3 >>> 1 (V) || R2 = [I1++] || I2 -= 4;
                            // R3.H = -X(17)/2; R2 = (X(5), X(4));
                            // I2 now points at the throw-away slot

    // Each pass forms two sums and two differences, all halved by (ASR), and
    // stores two packed words through I2 (descending addresses). In every
    // stored word the low half is a y1 term and the high half a y2 term.
    // The odd-numbered y1 terms are stored without the negation shown in the
    // header pseudocode; the IDCT loop subtracts those products instead.
    // The very first store writes the caller's R7 into the throw-away slot.
    LSETUP(ST_FIRST_STAGE, END_FIRST_STAGE) LC0 = P0 >> 1;
ST_FIRST_STAGE:
        R6 = PACK(R2.L, R0.H) || [I2--] = R7;
        R6 = R1 +|+ R6, R7 = R1 -|- R6 (ASR) || R1 = [I1++];
        R0 = PACK(R7.L, R0.L);
        R7 = PACK(R7.H, R6.L) || [I2--] = R0;
END_FIRST_STAGE:
        R0 = PACK(R2.H, R6.H) || R2 = [I1++];
    // Flush the last two words: (y2(2), y1(7)) and then (y2(1), y1(8)).
    R3 = PACK(R3.H, R6.H) || [I2--] = R7 || I1 += M0;
    [I2] = R3 || I1 -= 4;                   
/****************************** 9 POINT IDCT AND IDST ************************/
    B2 = I2;                // Base address of the input buffer of IDCT & IDST
    L2 = 36;                // Make the IDCT/IDST input array circular
    R0 = [I1--] || I2 -= M0;// Dummy fetch to decrement I1
                            // I2 moves to the first word stored by the
                            // first stage (top of the circular buffer)
    R1 = [I2--] || I1 -= 4; // First data of IDCT/IDST
                            // I1 is back at the base of the upper scratch
    R3 = [I3++] ;           // First coefficient from the twid_coef array
    MNOP;                   //to remove IAU stall

    // One pass per output point. Nine dual MACs run over the circular
    // buffer: A0 accumulates the IDCT (low halves, cosine coefficients) and
    // A1 the IDST (high halves, sine coefficients). The nine post-decrements
    // of I2 wrap around the buffer, so each pass starts on the same word.
    // A0 alternates -= / += to apply the sign of the odd y1 terms.
    LSETUP(ST_OUT_LOOP, END_OUT_LOOP) LC1 = P0;
ST_OUT_LOOP:
        A1 = R1.H * R3.H, A0 = R1.L * R3.L || R1 = [I2--] || R3 = [I3++];
                            // IDCT in A0 and IDST in A1
        R2.H = (A1 += R1.H * R3.H), R2.L = (A0 -= R1.L * R3.L) || R1 = [I2--]
        || R3 = [I3++];
        R2.H = (A1 += R1.H * R3.H), R2.L = (A0 += R1.L * R3.L) || R1 = [I2--]
        || R3 = [I3++];
        R2.H = (A1 += R1.H * R3.H), R2.L = (A0 -= R1.L * R3.L) || R1 = [I2--]
        || R3 = [I3++];
        R2.H = (A1 += R1.H * R3.H), R2.L = (A0 += R1.L * R3.L) || R1 = [I2--]
        || R3 = [I3++];
        R2.H = (A1 += R1.H * R3.H), R2.L = (A0 -= R1.L * R3.L) || R1 = [I2--]
        || R3 = [I3++];
        R2.H = (A1 += R1.H * R3.H), R2.L = (A0 += R1.L * R3.L) || R1 = [I2--]
        || R3 = [I3++];
        R2.H = (A1 += R1.H * R3.H), R2.L = (A0 -= R1.L * R3.L) || R1 = [I2--]
        || R3 = [I3++];
        R2.H = (A1 += R1.H * R3.H), R2.L = (A0 += R1.L * R3.L) || R1 = [I2--]
        || R3 = [I3++];
END_OUT_LOOP:
        [I1++] = R2;        // Store the IDCT result (low half) and the IDST
                            // result (high half) for this output point
    L2 = 0;                 // Restore length register
/************************** PUSH COEFFICIENTS IN STACK ************************/
    // Each constant pairs .5*cos (low half) with .5*sin (high half) of an
    // odd multiple of pi/72, in 1.15 format. They overwrite the first-stage
    // buffer, which is no longer needed. Eight go to memory in ascending
    // angle order (descending addresses); the ninth stays in R0 and is the
    // first one used by the twiddle loop.
    R0.L = 0x3FF0;
    R0.H = 0x02CB;
    [I2--] = R0;
    R0.L = 0x3F74;
    R0.H = 0x085B;
    [I2--] = R0;
    R0.L = 0x3E7C;
    R0.H = 0x0DDA;
    [I2--] = R0;
    R0.L = 0x3D0A;
    R0.H = 0x133F;
    [I2--] = R0;
    R0.L = 0x3B21;          // Push W1(i) = .5*cos((17:-2:1)*pi/72) and
    R0.H = 0x187E;          // W2(i) = .5*sin((17:-2:1)*pi/72) into stack
    [I2--] = R0;
    R0.L = 0x38C5;
    R0.H = 0x1D8D;
    [I2--] = R0;
    R0.L = 0x35FA;
    R0.H = 0x2263;
    [I2--] = R0;
    R0.L = 0x32C6;
    R0.H = 0x26F6;
    [I2] = R0 || I1 -= 4;   // I1 steps back to the last IDCT/IDST result
    R0.L = 0x2F30;
    R0.H = 0x2B3D;
/********************** TWIDDLE FACTOR MULTIPLICATION *************************/
    // R2 still holds the last IDCT/IDST result (low half = IDCT value a,
    // high half = IDST value b). The loop walks the results from last to
    // first, writing each product pair one word behind the value it loads;
    // its first store therefore writes a stale R1 over a word that is already
    // held in R2. The final product pair is left in R1, not stored.
    LSETUP(ST_TWID_MULT, END_TWID_MULT) LC0 = P0;
                            // c(i) = a(i)*W1(i) + b(8-i)*W2(i);
ST_TWID_MULT:
        A1 = R2.H * R0.L, A0 = R2.H * R0.H || [I1--] = R1;
                            // d(8-i) = -a(i)*W2(i) + b(8-i)*W1(i);
        R1.H = (A1 -= R2.L * R0.H), R1.L = (A0 += R2.L * R0.L) || R2 = [I1];
                            // R1.H = d term, R1.L = c term
END_TWID_MULT:
        R0 = [I2++];        // Next twiddle constant (ascending addresses)
/****************************** OUTPUT COPY ***********************************/
    P5 = P2;
    P5 += 36;               // P5 -> x(18), stored upwards
    P1 = P5;
    P1 += -2;               // P1 -> x(17), stored downwards
    R2.L = R1.H + R1.L (S) || I1 += 4;
                            // First sum (saturated); I1 moves back to the
                            // first stored product pair

    LSETUP(ST_OUT_COPY, END_OUT_COPY) LC0 = P0;
    P0 = P5;                // P0 is free once LSETUP has taken the count
    P0 += 34;               // P0 -> x(35), stored downwards
    // Each pass writes four outputs from one (d, c) pair: the saturated sum
    // and its negation to the first half, and the saturated difference twice
    // to the second half.
ST_OUT_COPY:
        R3.L = R1.H - R1.L (S) || W[P2++] = R2;
                            // Store x(0 to 8)
        R2 = -R2 (V) || W[P5++] = R3 || R1 = [I1++];
                            // Store x(18 to 26)
        R2.L = R1.H + R1.L (S) || W[P1--] = R2;
                            // Store x(17 to 9)
END_OUT_COPY:
        W[P0--] = R3;       // Store x(35 to 27)

    SP += 40;               // Release the lower scratch area
    SP += 40;               // Release the upper scratch area (done in two
                            // steps: 80 does not fit the 7-bit immediate)
    (R7:6, P5:5) = [SP++];  // Retrieve call preserved registers

    RTS;
    NOP;                    //to avoid one stall if LINK or UNLINK happens to be
                            //the next instruction after RTS in the memory.
__imdct36.end:
💿 文件大小 467 K
👤 上传用户 xuhuizi
📂 所属分类 DSP编程
🏷️ 相关标签

#Blackfin #ADI #DSP #编写
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -