📄 conv2d3x3_spl.asm

📁 ADI BF DSP的YUV到RGB转化汇编优化后的代码
💻 ASM
字号:
/*******************************************************************************
Copyright(c) 2000 - 2002 Analog Devices. All Rights Reserved.
Developed by Joint Development Software Application Team, IPDC, Bangalore, India
for Blackfin DSPs  ( Micro Signal Architecture 1.0 specification).

By using this module you agree to the terms of the Analog Devices License
Agreement for DSP Software. 
********************************************************************************
Module Name     : conv2d3x3_spl.asm
Label name      : __conv2d3x3_spl
Version         : 1.3
Change History  :

                Version   Date            Author        Comments
                1.3       11/18/2002      Swarnalatha   Tested with VDSP++ 3.0
                                                        compiler 6.2.2 on 
                                                        ADSP-21535 Rev.0.2
                1.2       11/13/2002      Swarnalatha   Tested with VDSP++ 3.0
                                                        on ADSP-21535 Rev. 0.2
                1.1       02/12/2002      Nishanth      Modified to match 
                                                        silicon cycle count
                1.0       05/08/2001      Nishanth      Original

Description     : This function does two dimensional circular convolution of a 
                  given sequence with 3 x 3 matrix. Both the MACs are used in 
                  this program.

                  In this implementation the circular convolution of two
                  matrices 'a' and 'b' is calculated. The dimension of 'a' is
                  row x col and that of 'b' is 3 x 3.
                  The dimension of the output matrix 'c' will be row x col.

                  The first two columns of outputs are calculated separately as 
                  these ones require circular buffering of each row.

                  The whole implementation is for fract16 input and output. 
                  The format of representation is 1.15 format.
 
Assumptions     : 1. The minimum input matrix size is 3 x 4.
                  2. Number of columns in input matrix should be even.
                  3. in[], mask[] and out[] should be aligned to a 4 byte 
                     boundary.
                  4. in[] and mask[] should be in different minibanks.
                  5. in[] and out[] should be in different minibanks.
                  6. mask[] and out[] should be in different minibanks.
                  7. mask[] and stack should be in different minibanks.

Prototype       : void _conv2d3x3_spl(
                        fract16 in[],
                            // (i) :  Pointer to the input matrix. 
                        short   row,
                            // (i) :  Number of rows of input matrix. 
                        short   col,
                            // (i) :  Number of columns of input matrix. 
                        fract16 mask[],
                            // (i) :  Pointer to 3x3 mask. 
                        fract16 out[])
                            // (o) :  Pointer to the output matrix. 

Registers used  : A0,A1, R0-R3,R7, I0-I2, B0-B2, M0-M3, L0-L2, P0-P1, LC0,LC1

Performance     :
      Code Size   : 288 Bytes.
      Cycle Count : 4.5 * row * col   +   3 * row   +   40
                          (4.5 cycles/pixel in core)
                    1240 cycles for an input matrix size of 16 x 16.
*******************************************************************************/
.section L1_code;
.global __conv2d3x3_spl;
.align 8;

/*
 * __conv2d3x3_spl - 3 x 3 circular 2-D convolution on fract16 data.
 *
 * Arguments as read by the code below:
 *   R0      = in    address of the input matrix
 *   R1      = row   number of input rows    (only R1.L enters the multiply)
 *   R2      = col   number of input columns (only R2.L enters the multiply)
 *   [SP+16] = mask  address of the mask   (offset is taken after R7 is pushed)
 *   [SP+20] = out   address of the output (offset is taken after R7 is pushed)
 *
 * Circular buffers set up by the prologue:
 *   I0/B0/L0 : input,  length 2 * row * col bytes
 *   I1/B1/L1 : output, length 2 * row * col bytes
 *   I2/B2/L2 : mask,   length 20 bytes (9 coefficients + 1 dummy halfword);
 *              all 20 bytes starting at mask[] are read.
 *
 * Notation used in the comments:
 *   x0..x3, x10..x13, x20..x23
 *       The three rows of the 3 x 4 input window that feeds one pair of
 *       horizontally adjacent outputs (A0 -> R0.L, A1 -> R0.H).
 *   h0..h8
 *       Coefficients in the order in which they are consumed.  I2 steps
 *       backwards through the mask buffer, so h0 is the halfword at byte
 *       offset 16 of mask[] and h8 is the halfword at byte offset 0.
 *       R7 holds h5 (high half) and h6 (low half) for the whole routine.
 *
 * Structure:
 *   1. FIRST_TWO_COLS loop - output columns 0 and 1 of every row.  Their
 *      input window wraps from the end of a row to its start, so I0 is
 *      repositioned with M0/M1/M2 inside every row of the window.
 *   2. ROW/COL loops - the remaining columns, two outputs per inner pass.
 *
 * State on return:
 *   R7 is restored from the stack.  L0-L2 are cleared.  R0-R3, A0, A1,
 *   I0-I2, B0-B2, M0-M3, P0, P1 and the hardware-loop counters are left
 *   modified.
 */
__conv2d3x3_spl:

    [--SP] = R7;            // Save R7 (restored before returning)
    P0 = R1;                // P0 = row, trip count of both row loops
    P1 = R2;                // P1 = col

    R1 = R1.L * R2.L (ISS2) || R7 = [SP+16];
                            // R1 = 2 * row * col (matrix size in bytes),
                            // R7 = address of mask

    I2 = R7;                // Mask pointer
    B2 = R7;                // Mask circular-buffer base
    L2 = 20;                // Mask buffer length = 20 (9 * 2   +   2 extra)

    I0 = R0;                // Input pointer
    B0 = R0;                // Input circular-buffer base
    L0 = R1;                // Input buffer length = 2 * row * col

    R2 = R2 << 1 || R3 = [SP+20] || R7 = [I2++];
                            // R2 = 2 * col (bytes per row), R3 = address of
                            // output matrix.  The load into R7 only serves to
                            // advance I2 by 4; R7 is reloaded below.
    M3 = R2;                // M3 = 2 * col

    I1 = R3;                // Output pointer
    B1 = R3;                // Output circular-buffer base
    L1 = R1;                // Output buffer length = 2 * row * col


    R0 = 4(Z);
    R1 = R2 << 1 || I0 -= M3 || R7 = [I2--];
                            // R1 = 4 * col, I0 wraps back by one row to the
                            // start of the last row, R7 = h5 (high), h6 (low)
    R1 += -6;               // R1 = 4 * col - 6
    R0 = R0 - R2(S) || I0 -= 4 || R3 = [I2--];
                            // R0 = 4 - 2 * col.  I0 now points to the last two
                            // columns of the last-but-second row.  The load
                            // into R3 only serves to step I2 (it wraps to
                            // byte offset 16 of the mask).
    M1 = R0;                // -2*COL + 4 : last two columns -> start of row
    M2 = R1;                // 4*COL - 6  : column 1 -> last two columns of the
                            //              next row
    M0 = -6;                // column 1 -> last two columns of previous row

    I1 -= M3;               // Back I1 up by one row (wraps to the last output
                            // row): the store in the loop below writes the
                            // result of the PREVIOUS iteration, so the first
                            // pass stores a leftover R0 there.  That location
                            // is overwritten by the store that follows the
                            // loop.

    R1 = [I0++M1] || R3 = [I2--];
                            // Prime the pipeline: R1 = x0,x1 and
                            // R3 = h0 (low half) + dummy halfword (high half)

    LSETUP (FIRST_TWO_COLS_ST, FIRST_TWO_COLS_END) LC0 = P0;
                            // One iteration per input row (counter = row)
FIRST_TWO_COLS_ST:
        A0 = R1.L * R3.L, A1 = R1.H * R3.L ||  R1.L = W[I0++] ||  R3 = [I2--];
                            // A0 = x0*h0, A1 = x1*h0,  x2  , h1,h2
        A0 += R1.H * R3.H, A1 += R1.L * R3.H || R1.H = W[I0] || I0 += M2;
                            // A0+= x1*h1, A1 += x2*h1, x3 , then move I0 to
                            // the last two columns of the next window row
        A0 += R1.L * R3.L, A1 += R1.H * R3.L || R1 = [I0++M1] || R3 = [I2--];
                            // A0 += x2*h2, A1 += x3*h2,  x10, x11,    h3,h4

        A0 += R1.L * R3.H, A1 += R1.H * R3.H ||  R1.L = W[I0++]
        ||  [I1++M3] = R0;
                            // A0 += x10*h3, A1 += x11*h3,  x12 , Store the
                            // output pair of the previous iteration and step
                            // I1 down one row
        A0 += R1.H * R3.L, A1 += R1.L * R3.L || R1.H = W[I0] || I0 += M2;
                            // A0 += x11*h4, A1 += x12*h4,  x13
        A0 += R1.L * R7.H, A1 += R1.H * R7.H || R1 = [I0++M1] ||  I2 -= 4;
                            // A0 += x12*h5, A1 += x13*h5,  x20,x21 ,
                            // I2 skips the h5,h6 word already held in R7

        A0 += R1.L * R7.L, A1 += R1.H * R7.L ||  R1.L = W[I0++] || R3 = [I2--];
                            // A0 += x20*h6, A1 += x21*h6,  x22 ,    h7,h8
        A0 += R1.H * R3.H, A1 += R1.L * R3.H || R1.H = W[I0] || I0 += M0;
                            // A0 += x21*h7, A1 += x22*h7,  x23 , then move I0
                            // back to the last two columns of the previous
                            // row, i.e. the first window row of the next pass
FIRST_TWO_COLS_END:
        R0.L=(A0 += R1.L * R3.L), R0.H=(A1 += R1.H * R3.L) || R1 = [I0++M1]
        || R3 = [I2--];
                            // A0 += x22*h8, A1 += x23*h8 and extract the two
                            // results into R0;  x0,x1 and h0,dummy for the
                            // next pass

    P1 += -2;               // Col - 2 columns remain in each row

    R1 = M2;                // 4*col - 6
    R1 += 8;
    M1 = R1;                // 4*col + 2 : subtracted from I0 to go from x23
                            //             back to x0 of the next output pair

    R1 = R1 >> 1 || [I1++M3] = R0 || R2 = [I2--];
                            // 2*col + 1 , Store first output pair of the last
                            // row (pending from the loop above; I1 wraps to
                            // the first output row), R2 = h1,h2 which stay in
                            // R2 for the whole second phase
    R1 += -7;
    M2 = R1;                // 2*col - 6 : x3 -> x10 and x13 -> x20

    M0 = -8;                // Rewinds I2 from the h0 word to the h3,h4 word

    MNOP || R1 = [I0++];    // Fetch x0,x1


    LSETUP (ROW_ST,ROW_END1) LC0 = P0;
                            // One iteration per input row (counter = row)
ROW_ST: A0 = R1.L * R3.L, A1 = R1.H * R3.L ||  R1.L = W[I0++] || I1 += 4;
                            // A0 = x0*h0, A1 = x1*h0, x2 , Skip the two
                            // outputs of this row written by the first phase
        LSETUP (COL_ST, COL_END) LC1 = P1 >> 1;
                            // Loop for all column in a row except first two,
                            // ctr = (col-2)/2
COL_ST:     A0 += R1.H * R2.H, A1 += R1.L * R2.H || R1.H = W[I0] || I0 += M2;
                            // A0 += x1*h1, A1 += x2*h1,  x3
            A0 += R1.L * R2.L, A1 += R1.H * R2.L || R1 = [I0++] || R3 = [I2--];
                            // A0 += x2*h2, A1 += x3*h2,  x10, x11 ,  h3,h4

            A0 += R1.L * R3.H, A1 += R1.H * R3.H ||  R1.L = W[I0++] ||  I2 -= 4;
                            // A0 += x10*h3, A1 += x11*h3,  x12 ,
                            // I2 skips the h5,h6 word already held in R7
            A0 += R1.H * R3.L, A1 += R1.L * R3.L || R1.H = W[I0] || I0 += M2;
                            // A0 += x11*h4, A1 += x12*h4,  x13
            A0 += R1.L * R7.H, A1 += R1.H * R7.H || R1 = [I0++];
                            // A0 += x12*h5, A1 += x13*h5,  x20,x21

            A0 += R1.L * R7.L, A1 += R1.H * R7.L ||  R1.L = W[I0++]
            || R3 = [I2--];
                            // A0 += x20*h6, A1 += x21*h6,  x22 ,   h7,h8
            A0 += R1.H * R3.H, A1 += R1.L * R3.H || R1.H = W[I0] || I0 -= M1;
                            // A0 += x21*h7, A1 += x22*h7,  x23 , then move I0
                            // back to x0 of the next output pair
            R0.L=(A0 += R1.L * R3.L), R0.H=(A1 += R1.H * R3.L) || R1 = [I0++]
            || R3 = [I2++M0];
                            // A0 += x22*h8, A1 += x23*h8 and extract the two
                            // results into R0;  x0,x1 ,   h0,dummy
COL_END:    A0 = R1.L * R3.L, A1 = R1.H * R3.L || R1.L = W[I0++] || [I1++] = R0;
                            // A0 = x0*h0, A1 = x1*h0,  x2  , store output.
                            // On the last pass of a row these products are
                            // discarded: ROW_ST starts A0/A1 afresh.
ROW_END1:
        R1.H=W[I0++];       // Fetch x1 for next output, x0 will be in R1.L

    R7 = [SP++];            // Restore R7

    // The Blackfin C run-time model expects the DAG length registers to be
    // zero at function boundaries.  Put L0-L2 back into linear-addressing mode
    // so that a compiled caller using I0-I2 does not wrap inside buffers that
    // are no longer in use.  These three instructions are not included in the
    // code-size and cycle-count figures quoted in the file header.
    L0 = 0;
    L1 = 0;
    L2 = 0;

    RTS;
    NOP;                    //to avoid one stall if LINK or UNLINK happens to be
                            //the next instruction after RTS in the memory.
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -