r8x8invdct.asm

来自「ADI BF DSP的FFT汇编优化后的代码」· 汇编代码 · 共 323 行
ASM
323 行
/*******************************************************************************
Copyright(c) 2000 - 2002 Analog Devices. All Rights Reserved.
Developed by Joint Development Software Application Team, IPDC, Bangalore, India
for Blackfin DSPs  ( Micro Signal Architecture 1.0 specification).

By using this module you agree to the terms of the Analog Devices License
Agreement for DSP Software. 
********************************************************************************
Module Name     : r8x8invdct.asm
Label name      :  __r8x8invdct
Version         :   1.3
Change History  :

                Version     Date          Author        Comments
                1.3         11/18/2002    Swarnalatha   Tested with VDSP++ 3.0
                                                        compiler 6.2.2 on 
                                                        ADSP-21535 Rev.0.2
                1.2         11/13/2002    Swarnalatha   Tested with VDSP++3.0
                                                        on ADSP-21535 Rev.0.2
                1.1         02/19/2002    Vijay         Modified to match
                                                        silicon cycle count
                1.0         03/13/2001    Vijay         Original 

Description     : This is the implementation of Chen's algorithm of IDCT.
                  It is based on the separable nature of IDCT for multi-
                  dimension. The input matrix is 8x8 real data. First, one dime-
                  nsional 8-point IDCT is calculated for each of the 8 rows. The
                  output is stored in a separate matrix after transpose. Then 
                  again 8-point IDCT is calculated on each row of matrix. The 
                  output is again stored in a transpose matrix. This is final 
                  output.
                 
                  Chen's algorithm has 4 stages (parts) of implementation.

                  This implementation works only for 8x8 input. The input data 
                  should be real. The range of input should be -256 to 255. 
                 
                  The algorithm works in place: the result overwrites "in".

                  Note : The algorithm reads the input data from the "in" 
                  matrix.
                  First 8-point IDCT will be calculated for all the 8 rows. This
                  output is stored in "temp" buffer in the transposed form at 
                  bit reversed locations. Again the 8-point IDCT is applied on 
                  all the 8 rows of "temp" buffer. Final output computed is 
                  stored in "in" buffer in transposed form at bit reversed 
                  locations. The operation of transposing the matrix and 
                  calculation of bit reversed are carried out while writing the 
                  data without any explicit code.
                         
                  The output of the function is provided in the "in" buffer
                  in normal order.

Prototype       : void _r8x8invdct(fract16 *in, fract16 *coeff, fract16 *temp);

                  *in    -> Pointer to Input vector.
                  *coeff -> Pointer to coefficients.
                  *temp  -> Pointer to temporary data. 

Registers Used  : A0, A1, R0-R7, I0-I3, B0-B3, M1-M3, L0-L3, P0-P5, LC0, 
                  LC1.

Performance     :
                    Code Size   : 340 Bytes.
                    Cycle Count : 319 Cycles.
******************************************************************************/
.section    L1_code;
.global     __r8x8invdct;
.align      8;
    
/*
*   __r8x8invdct - 8x8 inverse DCT (two passes of eight 8-point row IDCTs).
*
*   Entry  : R0 = pointer to the 8x8 input/output matrix ("in"),
*            R1 = pointer to the coefficient table ("coeff"),
*            R2 = pointer to the 8x8 scratch matrix ("temp").
*            All data is accessed as 16-bit words; the coefficient table is
*            read as 32-bit words through a 16-byte circular window, so it is
*            presumably four 32-bit entries and 4-byte aligned - confirm
*            against the table definition.
*   Exit   : result in "in"; "temp" is overwritten.
*   Saved  : R4-R7 and P3-P5 are pushed on entry and popped on exit.
*   Clobber: A0, A1, R0-R3, P0-P2, I0-I3, B0-B3, M1-M3, LC0, LC1 (and the
*            associated loop top/bottom registers). L0-L3 are zero on exit.
*/
__r8x8invdct:

/******************** Function Prologue ***************************************/
    [--SP] = (R7:4, P5:3);  // Save R4-R7 and P3-P5; popped again at TERMINATE.
    B0 = R0;                // Pointer to Input matrix.
    B3 = R1;                // Pointer to Coefficients
    B2 = R2;                // Pointer to Temporary matrix.
    L0 = 0;                 // I0, I1 and I2 are used with linear addressing,
    L1 = 0;                 // so their length registers are cleared.
    L2 = 0;
    L3 = 16;                // I3 wraps inside a 16-byte window based at B3,
                            // which makes the coefficient array circular.
                            // Cleared again before returning.
    
    M1 = 16 (X);            // Row stride in bytes (8 elements x 2 bytes).
    M2 = 7 (X);             // Inner loop count; copied to P2 before LSETUP.
    M3 = 8(X);              // Byte offset from element 0 to element 4.
    P2 = 16;                // Output post-modify: one row (row 6 -> row 7).
    P3 = 32 (X);            // Output post-modify: two rows.
    P4 = -110 (X);          // P0 rewind: row 7 (112) back to row 0, next
                            // column (112 - 110 = 2).
    P5 = -62 (X);           // P1 rewind: row 5 (80) back to row 1, next
                            // column (80 - 62 = 18).
    P0 = 2;                 // Outer loop count (two passes).
    NOP;                    // Separates the P0 load from the LSETUP that
                            // reads it.
    
/*
*   According to Chen's algorithm, first 8-point IDCT will be calculated for all
*   the 8 rows. The output of this calculation is stored in another transpose 
*   matrix. Now again the 8-point IDCT is applied on all the 8 rows. The output
*   is stored in matrix transpose form. This is the final output. Therefore,
*   a loop of 2 iteration (IDCT_START, IDCT_END) is set.
*
*   B0 points to the "in" buffer and B2 points to "temp" buffer in the first 
*   iteration. The input is read from "in" buffer and output is written to
*   "temp" buffer. In the second iteration of IDCT_START B0 points to "temp" and
*   B2 points to "in" buffer. The input is read from "temp" buffer and output
*   is written to "in" buffer. "in" buffer holds the final output. 
*/
    
    LSETUP(IDCT_START, IDCT_END) LC0 = P0;
IDCT_START: 
        I0 = B0;            // I0 points to Input Element (0, 0)
        I2 = B0;            // I2 points to Input Element (0, 0)
        I2 += M3 || R3.L = W[I0];
                            // I2 now points to input Element (0, 4).
                            // Element 0 is read in R3.L 
        I1 = I2;            // I1 points to input Element (0, 4) ...
        I1 += 4  || R3.H = W[I2++];
                            // ... and is advanced to Element (0, 6).
                            // Element 4 is read in R3.H; I2 is left pointing
                            // to Element (0, 5).
        I3 = B3;            // I3 points to Coefficients
        P0 = B2;            // P0 points to array Element (0, 0) for writing 
                            // output
        P1 = B2;
        R7.L = 0x5a82;      // R7.L holds the coefficients C4.
        P1 = P1 + P2;       // P1 points to array element (1, 0) for writing 
                            // output
    
/********************** Implementation of Part 1 ******************************/
    
/*
*   The following operation is done in 2 instructions.
*   A1 = Element 0 * cos(pi/4) 
*   A0 =  Element 0 * cos(pi/4)
*   A1 = A1 - Element 4 * cos(pi/4)
*   A0 = A0 + Element 4 * cos(pi/4)
*   At the same time the values of Element 2 and 6 are read in R1.H and R1.L
*   respectively, and the next coefficient pair is loaded into R7.
*/
    
        A1 = R3.L * R7.L, A0 = R3.L * R7.L  || I0 += 4  || R1.L = W[I1++];
        R3.H = (A1 -= R3.H * R7.L), R3.L = ( A0 += R3.H * R7.L) 
        || R1.H = W[I0++] || R7 = [I3++];
    
/*
*   The following two instructions do -
*   A1 = Element 2 * cos(3pi/8) 
*   A0 =  Element 6 * cos(3pi/8)
*   A1 = A1 - Element 6 * cos(pi/8)
*   A0 = A0 + Element 2 * cos(pi/8)
*   R2 reads the input elements ( 5, 3).
*   R7 reads the coefficients value C5 and C3.
*/
    
        A1= R1.H * R7.L, A0 = R1.L * R7.L || I0 -= 4 || R2.L = W[I0];
        R1.H = (A1 -= R1.L * R7.H), R1.L = (A0 += R1.H * R7.H) || R2.H = W[I2--]
        || R7 = [I3++];
    
/*
*   The following three instructions do -
*   A1 = Element 5 * cos(3pi/16) 
*   A0 =  Element 3 * cos(3pi/16)
*   A1 = A1 - Element 3 * cos(5pi/16)
*   A0 = A0 + Element 5 * cos(5pi/16)
*   Element 0 = (Element 0 + Element 6) / 2.
*   Element 4 = (Element 4 + Element 2) / 2.
*   Element 2 = (Element 4 - Element 2) / 2.
*   Element 6 = (Element 0 - Element 6) / 2.
*   Elements 1 and 7 are read into R0.H and R0.L meanwhile.
*   The writing W[P0] = R5.L is done for packing purpose: the output location
*   is used as scratch and is read back into R3.H inside the loop, before the
*   final value for that location is stored. The register locations for
*   element 4 and 6 are swapped.
*/
    
        A1 = R2.H * R7.L, A0 = R2.L * R7.L  || R0.H = W[I0--]   || NOP;
        R3 = R3 +|+ R1, R5 = R3 -|- R1 (ASR) || R0.L = W[I1--]  || NOP;
        R2.H = ( A1 -= R2.L * R7.H), R2.L = (A0 += R2.H * R7.H)
        || W[P0] = R5.L || R7 = [I3++];
    
/*
*   At the end of part 1 R0 has (1, 7), R5 has (2, 6), R2 has (5, 3) and
*   R3 has (4, 0). 
*   Where notation (x, y) means the element from column x is in upper half of 
*   the register and element from column y is in lower half of the register.
*/
    
// The loop for 7 is set. The last iteration is computed separately.
// Each loop pass finishes parts 2-4 of the current row and, interleaved with
// the output stores, performs part 1 of the next row.
        P2 = M2;            // P2 temporarily carries the loop count (7) ...
        LSETUP (R0W_START, R0W_END) LC1 = P2;
        P2 = 16;            // ... and is restored to the output post-modify.
R0W_START:
    
/****************** Implementation of Part 2 **********************************/
/*
*   The following two instruction does the following job -
*   A1 = Element 1 * cos(7pi/16) 
*   A0 =  Element 7 * cos(7pi/16)
*   A1 = A1 - Element 7 * cos(pi/16)
*   A0 = A0 + Element 1 * cos(pi/16)
*   The read to R1 is dummy; it only advances I1 by one row (M1).
*   I0 is advanced to the next row as well.
*   W[P1] = R3.H parks R3.H in the output buffer; it is read back into R5.L
*   in part 3.
*   R7 reads the coefficient value C2 and C6.
*/
    
            A1 = R0.H * R7.L, A0 = R0.L * R7.L  || R1 = [I1++M1] 
            || W[P1] = R3.H;
            R0.H = (A1 -= R0.L * R7.H), R0.L = (A0 += R0.H * R7.H)  || I0 += M1 
            || R7 = [I3++];
    
/*
*   The following single instructions operates on 4 data as -
*   Element 1 = (Element 1 + Element 5) / 2.
*   Element 5 = (Element 1 - Element 5) / 2.
*   Element 3 = (Element 7 - Element 3) / 2.
*   Element 7 = (Element 7 + Element 3) / 2.
*   In parallel R3.H reloads the value parked at W[P0] and R1.L reads
*   element 6 of the next row.
*/
    
            R0 = R0 +|+ R2, R2 = R0 -|- R2 (ASR) || R3.H = W[P0]
            || R1.L = W[I1++];
    
/*
*   At the end of part 2 R0 has (1, 7), and R2 has (5, 3).
*   The registers R3.H and R5.L are being swapped. 
*/
    
/******************** Implementation of Part 3 ********************************/
/*
*  The following instruction does the following job.
*  Element 0 = Element 0 + Element 7.
*  Element 7 = Element 7 - Element 0.
*  Element 6 = Element 6 + Element 1.
*  Element 1 = Element 6 - Element 1.
*  The elements 0, 1, 6 and 7 are final.
*  I2 is advanced to the next row and element 0 of the next row is read
*  into R3.L.
*/
    
            R4 = R3 +|+ R0, R0 = R3 -|- R0 || I2 += M1 || R3.L = W[I0];
    
/*
*  The following two instructions do -
*  A1 = Element 3 * cos(pi/4) 
*  A0 =  Element 3 * cos(pi/4)
*  A1 = A1 - Element 5 * cos(pi/4)
*  A0 = A0 + Element 5 * cos(pi/4)
*  Elements 4 and 2 of the next row are read into R3.H and R1.H, and R5.L
*  reloads the value parked at W[P1].
*/
    
            A1 = R2.L * R7.L, A0 = R2.L * R7.L || I0 += 4 || R3.H = W[I2++];
            R2.H = (A1 -= R2.H * R7.L), R2.L = (A0 += R2.H * R7.L)
            || R5.L = W[P1] || R1.H = W[I0++];
    
/*
*   At the end of part 3 R0 has (1, 7), and R2 has (5, 3), R4 has (6, 0) 
*   and R5 has (2, 4). Registers R4 and R0 holds final output.
*/
    
/******************** Implementation of Part 4 ********************************/
/*
*  It is the final stage computation.
*  Element 4 = Element 4 + Element 3.
*  Element 2 = Element 2 + Element 5.
*  Element 5 = Element 5 - Element 2.
*  Element 3 = Element 3 - Element 4.
*/
    
            R5 = R5 +|+ R2, R6 = R5 -|- R2 || W[P0++P3] = R4.L  
            || R2.L = W[I0--];
                            //R5 = 2, 4 and R6 = 5, 3 final
/*
*   At the end of part 4 R0 has (1, 7), and R4 has (6, 0), R5 has (2, 4) 
*   and R6 has (5, 3). All the registers hold final output.
*/
    
/**************************** Implementation of Part 1 ************************/
/*
*  This is the same part as part 1 specified earlier. First time the part 1 
*  calculation is done outside the loop, afterwards it is done here. It serves
*  two purposes. First it computes part 1 for the next row, and second it
*  writes the data 2, 1, 3, 7, 4, 6 and 5 to its bit reversed order in
*  transpose way.
*
*  Stores through P0 step down one output column in two-row steps (P3), then
*  one row (P2), then rewind to the top of the next column (P4). Stores
*  through P1 cover the odd rows 1, 3 and 5 and rewind with P5.
*  The first load into R2.H (through I0) is a dummy that only steps I0 back;
*  R2.H is loaded for real through I2 two instructions later.
*/
            A1 = R3.L * R7.L, A0 = R3.L * R7.L || W[P0++P3] = R5.H 
            || R2.H = W[I0--];
            R3.H = (A1 -= R3.H * R7.L), R3.L = ( A0 += R3.H * R7.L) 
            || W[P0++P3] = R0.H || R7 = [I3++];
            A1= R1.H * R7.L, A0 = R1.L * R7.L || W[P0++P2] = R6.L  
            || R2.H = W[I2--];
            R1.H = (A1 -= R1.L * R7.H), R1.L = (A0 += R1.H * R7.H) 
            || W[P0++P4] = R0.L || R7 = [I3++];
            A1 = R2.H * R7.L, A0 = R2.L * R7.L || W[P1++P3] = R5.L 
            || R0.H = W[I0--];
            R3 = R3 +|+ R1, R5 = R3 -|- R1 (ASR) || W[P1++P3] = R4.H 
            || R0.L = W[I1--];
            W[P0] = R5.L;   // Park R5.L for the R3.H / R5.L swap (see part 2).
R0W_END:    R2.H = ( A1 -= R2.L * R7.H), R2.L = (A0 += R2.H * R7.H)
            || W[P1++P5] = R6.H || R7 = [I3++];
    
/*
*  The computation for 7 rows are over. The last row computation is done here.
*  This is the same part as part 2, 3 and 4 done inside the loop, without the
*  input reads and pointer updates for a following row.
*/
    
        A1 = R0.H * R7.L, A0 = R0.L * R7.L  || NOP  || W[P1] = R3.H; 
        R0.H = (A1 -= R0.L * R7.H), R0.L = (A0 += R0.H * R7.H) || R7 = [I3++];
        R0 = R0 +|+ R2, R2 = R0 -|- R2 (ASR) || R3.H = W[P0];
        R4 = R3 +|+ R0, R0 = R3 -|- R0;
        A1 = R2.L * R7.L, A0 = R2.L * R7.L;
        R2.H = (A1 -= R2.H * R7.L), R2.L = (A0 += R2.H * R7.L) || R5.L = W[P1];
        R5 = R5 +|+ R2, R6 = R5 -|- R2 || W[P0++P3] = R4.L || NOP;
    
        W[P0++P3] = R5.H;   // The last outputs are written here.
        W[P0++P3] = R0.H;
        W[P0++P2] = R6.L;
        W[P0++P4] = R0.L;
        W[P1++P3] = R5.L;
        W[P1++P3] = R4.H;
        W[P1++P5] = R6.H;
    
        B1 = B2;            // The pointers to output and input are swapped:
                            // B1 temporarily holds the old output pointer.
        B2 = B0;            // B2 (output) now points to the old input buffer.
IDCT_END: 
        B0 = B1;            // B0 (input) now points to the old output buffer.
    
TERMINATE:
    (R7:4,P5:3)=[SP++];     //Pop the registers before returning.
    L3 = 0;                 // Disable circular addressing on I3 again. The
                            // VisualDSP++ run-time model expects L0-L3 to be
                            // zero on return; without this, callers using I3
                            // would wrap inside a stale 16-byte window.
                            // NOTE(review): the code size / cycle count in the
                            // file header predate this instruction.
    RTS;                    //Return.
    NOP;                    //to avoid one stall if LINK or UNLINK happens to be
                            //the next instruction after RTS in the memory.
r8x8invdct.asm - 源码说明

本页面展示了「ADI BF DSP的FFT汇编优化后的代码」中的 r8x8invdct.asm 源码文件，采用汇编编程语言编写，共 323 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫开发者社区收录了大量与DSP相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?