📄 conv2d5x5_gen.asm

📁 ADI BF535 DSP 图象5*5卷积计算汇编优化源码
💻 ASM
字号:
/*******************************************************************************
Copyright(c) 2000 - 2002 Analog Devices. All Rights Reserved.
Developed by Joint Development Software Application Team, IPDC, Bangalore, India
for Blackfin DSPs  ( Micro Signal Architecture 1.0 specification).

By using this module you agree to the terms of the Analog Devices License
Agreement for DSP Software. 
********************************************************************************
Module Name     : conv2d5x5_gen.asm
Label name      : __conv2d5x5_gen
Version         : 1.3
Change History  :
                Version   Date            Author        Comments
                1.3       11/18/2002      Swarnalatha   Tested with VDSP++ 3.0
                                                        compiler 6.2.2 on 
                                                        ADSP-21535 Rev.0.2
                1.2       11/13/2002      Swarnalatha   Tested with VDSP++ 3.0
                                                        on ADSP-21535 Rev. 0.2
                1.1       02/12/2002      Nishanth      Modified to match 
                                                        silicon cycle count
                1.0       05/08/2001      Nishanth      Original

Description     : This function does two dimensional circular convolution of a 
                  given sequence with 5 x 5 matrix.

                  In this implementation circular convolution of two matrices 
                  `a` and `b` is calculated. The dimension of 'a' is row x col 
                  and that of 'b' is 5 x 5. 
                  The dimension of the output matrix c will be row x col.

                  The first four columns of outputs are calculated separately as
                  these ones require circular buffering of each row.

                  The whole implementation is for fract16 input and output. 
                  The format of representation is 1.15 format.
 
Assumptions     : 1. The minimum input matrix size is 5 x 5.
                  2. in[], mask[] and out[] should be aligned to a 2 byte 
                     boundary.
                  3. in[] and stack[] should be in different minibanks.
                  4. mask[] and stack should be in different minibanks.

Prototype       : void _conv2d5x5_gen(
                        fract16 in[],
                            // (i) :  Pointer to the input matrix. 
                        short   row,
                            // (i) :  Number of rows of input matrix. 
                        short   col,
                            // (i) :  Number of columns of input matrix. 
                        fract16 mask[],
                            // (i) :  Pointer to 5x5 mask. 
                        fract16 out[])
                            // (o) :  Pointer to the output matrix. 

Registers used  : A0, R0-R7, I0-I3, B0-B3, M0-M3, L0-L3, P0-P2, LC0,LC1

Performance     :
                Code Size   : 332 Bytes.
                Cycle Count : 28 * row * col   +   90 * row   +   77
                          (28 cycles/pixel in core for all outputs except those 
                           in first four columns.)
                          (46 cycles/pixel for outputs in first four columns 
                          of output.)
                     1227 cycles for an input matrix size of  5 x 5.
                     5177 cycles for an input matrix size of 10 x 15.
                     8685 cycles for an input matrix size of 16 x 16.
******************************************************************************/
.section L1_code;
.global __conv2d5x5_gen;
.align 8;

/*------------------------------------------------------------------------------
 __conv2d5x5_gen : 2-D circular convolution of a row x col fract16 matrix with
                   a 5 x 5 fract16 mask.

 Inputs  : R0      = in[]   (input matrix)
           R1      = row
           R2      = col
           [SP+12] = mask[] on entry (read as [SP+28] after the R7:4 push)
           [SP+16] = out[]  on entry (read as [SP+32] after the R7:4 push)
 Outputs : out[] is filled with row x col results.

 Register roles once set-up is complete:
           B0/I0/L0 : input matrix, circular over 2*row*col bytes
           B1/I1/L1 : current input row, circular over 2*col bytes
                      (B1 is re-pointed at every mask row in the first pass)
           B2/I2/L2 : flipped mask copy on the stack, circular over 60 bytes
           B3/I3/L3 : output matrix, circular over 2*row*col bytes
           M0 = 2*col - 8      M2 = 10*col (five rows, in bytes)
           M3 = 2*col (one row, in bytes)
           M1 = column back-offset in the first pass (8,6,4,2), then 8
           P0 = 5 (mask rows)  P1 = 4 (wrapped columns)  P2 = col - 4
           R2 = 2*col          R7 = 2*row*col            R0 = row byte counter

 Structure:
           1. Copy the mask in reverse order onto the stack.
           2. LOOP_FS_ROW : output columns 0..3 of every row; these need the
              per-row circular buffer (I1/B1/L1) because the 5-sample window
              wraps round the start of the row.
           3. LOOP_ROW    : output columns 4..col-1 of every row, reading the
              input straight through I0.

 R7:4 are saved and restored. L0-L3 are cleared before returning so that the
 caller sees linear (non-circular) addressing on I0-I3.
------------------------------------------------------------------------------*/
__conv2d5x5_gen:
    
    [--SP] = (R7:4);        // Save R7:4 (16 bytes pushed)
    
    P2 = R2;                // P2 = Number of columns
    
    B0 = R0;                // B0 stores start address of input matrix.
    I0 = R0;
    R7 = R1.L * R2.L (ISS2) || P0 = [SP+28];
                            // R7 = 2 * row * col (matrix size in bytes),
                            // P0 = Address of Mask
    L0 = R7;                // Input is a circular buffer of 2 * row * col bytes
    L3 = R7;                // Output is a circular buffer of the same length
    
    R2 = R2 << 1 || R6 = [SP+32];
                            // R2 = 2 * col (row size in bytes),
                            // R6 = Address of output matrix
    L1 = R2;                // Row circular buffer length = 2 * col bytes
    
    SP += -60;              // 60 bytes are allocated in stack for storing 
                            // flipped mask
    B2 = SP;                // Address of new(flipped) mask
    I2 = B2;
    L2 = 60;                // Length of flipped mask = 60 bytes (25 elements
                            // + 5 dummy ones, one per row, so that the mask
                            // can be read two elements at a time)
    P0 += 48;               // P0 is made to point to the last element
                            // (index 24) of the original mask
    
    P1 = 5;                 // Loop counter = 5(no: of elements in each row of 
                            // mask)
    R0 = W[P0--](Z);        // Last element is fetched

    // Each pass writes one reversed mask row: four halfword stores followed
    // by one 32-bit store whose upper half (zero, because the loads are
    // zero-extended) is the dummy element.
    // NOTE: one load is issued before the loop and five per pass, so the
    // final load reads the halfword just below mask[0]; that value is never
    // stored (R0 is overwritten after the loop).
    LSETUP (REVERSE_ST,REVERSE_END) LC0 = P1;
REVERSE_ST:
        W[I2++] = R0.L || R0 = W[P0--](Z);
                            // Store element, fetch the preceding one
        W[I2++] = R0.L || R0 = W[P0--](Z);
                            // Store element, fetch the preceding one
        W[I2++] = R0.L || R0 = W[P0--](Z);
                            // Store element, fetch the preceding one
        W[I2++] = R0.L || R0 = W[P0--](Z);
                            // Store element, fetch the preceding one
REVERSE_END:
        [I2++] = R0    || R0 = W[P0--](Z);
                            // Store fifth element plus a zero dummy halfword,
                            // fetch the preceding element
    
    B3 = R6;
    I3 = R6;                // Address of output buffer.
    M3 = R2;                // M3 = 2 * col
    
    R0 = R2 << 2;           // R0 = 8 * col
    R6 = R0 + R2;
    M2 = R6;                // M2 = 10 * col
    
    R0 = R2;
    R0 += -8;
    M0 = R0;                // 2*COL - 8
    
    P0 = 5;                 // Loop counter since there are 5 rows in mask
    P1 = 4;                 // Loop for first four columns of output
    P2 += -4;               // Col - 4
    R5 = 2(Z);              // Step by which the column back-offset shrinks
    
    R0 = R0 - R0(S) || I0 += M3;
                            // R0 = 0 (row byte counter), I0 moves on one row
    
    A0 = 0 || I0 -= M2 || R3 = [I2++];
                            // Accumulator reset, Fetch first two elements
                            // (h0,h1) from mask
                            // I0 moves back five rows (circularly), i.e. it
                            // ends up four rows before the start of in[]

    //--------------------------------------------------------------------------
    // Pass 1: output columns 0..3 of every row.
    //--------------------------------------------------------------------------
LOOP_FS_ROW:
    R4 = 8;
    M1 = R4;                // Column back-offset starts at 8 bytes (4 samples)
    LSETUP (COL_FS_ST, COL_FS_END) LC0 = P1; 
                            // Loop over the first four output columns
                            // (counter = 4)
COL_FS_ST:
        B1 = I0;            // B1 stores the modified address on rows.
        I1 = B1;            // Address is copied to I1
        LSETUP (CONV_FS_ST, CONV_FS_END) LC1 = P0;
                            // Loop for finding one output(convoluting),
                            // one pass per mask row
        R4 = R4 - R5(S) || I1 -= M1;
                            // Executed once, before the loop body:
                            // next offset = current - 2, and I1 is moved back
                            // by the current offset (wrapping inside the row)
CONV_FS_ST: R1.L = W[I1++] || I0 += M3;
                            // Fetch x0, I0 moves on to the next input row
            A0 += R1.L * R3.L || R1.H = W[I1++]; 
                            // A0 += x0*h0,  x1
            A0 += R1.H * R3.H || R1.L = W[I1++] ||  R3 = [I2++]; 
                            // A0 += x1*h1, x2  ,  h2,h3
            A0 += R1.L * R3.L || R1.H = W[I1++]; 
                            // A0 += x2*h2  x3
            A0 += R1.H * R3.H || R1.L = W[I1++] ||  R3 = [I2++]; 
                            // A0 += x3*h3,  x4 , h4,dummy
            B1 = I0;        // Row circular buffer is re-based on the next row
            I1 = B1;        // Address is copied to I1
CONV_FS_END:R1.L=(A0+=R1.L * R3.L) || I1 -= M1 || R3 = [I2++]; 
                            // A0 += x4*h4 and the result is extracted to R1.L,
                            // Add column offset , h5,h6
        M1 = R4;            // Column offset is updated
COL_FS_END:
        A0 = 0 || I0 -= M2 || W[I3++] = R1.L;
                            // Reset accumulator, move I0 back five rows,
                            // Store the output
    
    R0 = R0 + R2(S) || I3 += M0;
                            // Row byte counter += 2 * col,
                            // I3 skips the remaining col - 4 outputs of the row
    I0 += M3;               // I0 moves on to the next input row
    
    CC = R0 < R7;
    If CC JUMP LOOP_FS_ROW (BP);
                            // Jump if all input rows are not over 
    
    M1 = 8;
    MNOP || I3 += M1;       // Modify I3 as the first 4 columns are already 
                            // stored

    //--------------------------------------------------------------------------
    // Pass 2: output columns 4..col-1 of every row.
    // R0 enters holding the byte count accumulated in pass 1 and is counted
    // back down to zero, one row (2 * col bytes) at a time.
    //--------------------------------------------------------------------------
LOOP_ROW:
    LSETUP (COL_ST, COL_END) LC0 = P2; 
                            // Loop over the remaining output columns of one
                            // row (counter = col - 4)
COL_ST: MNOP || R1.L = W[I0++];
                            // Fetch x0 
        LSETUP (CONV_ST, CONV_END) LC1 = P0;
                            // Loop for finding one output(convoluting),
                            // one pass per mask row
CONV_ST:    A0 += R1.L * R3.L || R1.H = W[I0++]; 
                            // A0 += x0*h0,  x1
            A0 += R1.H * R3.H || R1.L = W[I0++] ||  R3 = [I2++]; 
                            // A0 += x1*h1, x2  ,  h2,h3
            A0 += R1.L * R3.L || R1.H = W[I0++] ||  R5 = [I2++]; 
                            // A0 += x2*h2,  x3  ,  h4,dummy (into R5)
            A0 += R1.H * R3.H || R1.L = W[I0] || I0 += M0; 
                            // A0 += x3*h3,  x4 , then I0 advances by
                            // 2*col - 8 to the same column of the next row
CONV_END:   R6.L=(A0+=R1.L * R5.L) || R1.L = W[I0++] || R3 = [I2++]; 
                            // A0 += x4*h4 and the result is extracted to R6.L,
                            // first sample of the next row , h5,h6
COL_END:
        A0 = 0 || I0 -= M2 || W[I3++] = R6.L;
                            // Reset accumulator, move I0 back five rows,
                            // Store the output
    R0 = R0 - R2(S) || I0 += M1;
                            // Row byte counter -= 2 * col,
                            // I0 skips 8 bytes to the start of the next row
    I3 += M1;               // I3 skips the four outputs written in pass 1
    
    CC = R0 == 0;
    IF !CC JUMP LOOP_ROW (BP);
                            // Jump if all input rows are not over 
    
    L0 = 0;                 // Clear the circular buffer lengths so that the
    L1 = 0;                 // caller gets linear addressing on I0-I3 again.
    L2 = 0;                 // (These four instructions are not included in the
    L3 = 0;                 // code size / cycle figures of the file header.)
    
    SP += 60;               // Restore SP
    (R7:4) = [SP++];        // Restore R7:4
    RTS;                 
                            // Program ends here
    NOP;                    //to avoid one stall if LINK or UNLINK happens to be
                            //the next instruction after RTS in the memory.
💿 文件大小 11 K
👤 上传用户 lanyiting1
📂 所属分类 DSP编程
🏷️ 相关标签

#ADI #535 #DSP #BF
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -