📄 conv2d5x5_spl.asm
字号:
/*******************************************************************************
Copyright(c) 2000 - 2002 Analog Devices. All Rights Reserved.
Developed by Joint Development Software Application Team, IPDC, Bangalore, India
for Blackfin DSPs ( Micro Signal Architecture 1.0 specification).
By using this module you agree to the terms of the Analog Devices License
Agreement for DSP Software.
********************************************************************************
Module Name : conv2d5x5_spl.asm
Label name : __conv2d5x5_spl
Version : 1.3
Change History :
Version Date Author Comments
1.3 11/18/2002 Swarnalatha Tested with VDSP++ 3.0
compiler 6.2.2 on
ADSP-21535 Rev.0.2
1.2 11/13/2002 Swarnalatha Tested with VDSP++ 3.0
on ADSP-21535 Rev. 0.2
1.1 02/12/2002 Nishanth Modified to match
silicon cycle count
1.0 05/08/2001 Nishanth Original
Description : This function does two dimensional circular convolution of a
given sequence with 5 x 5 matrix. Both the MACs are used in
this program.
In this implementation circular convolution of two matrices
`a` and `b` is calculated. The dimension of 'a' is row x col
and that of 'b' is 5 x 5.
The dimension of the output matrix c will be row x col.
The first four columns of outputs are calculated separately as
these ones require circular buffering of each row.
The whole implementation is for fract16 input and output.
The format of representation is 1.15 format.
Assumptions : 1. The minimum input matrix size is 5 x 6.
2. in[] and out[] should be aligned to a 4 byte boundary.
3. mask[] should be aligned to a 2 byte boundary.
4. in[] and mask[] should be in different minibanks.
Prototype : void _conv2d5x5_spl(
fract16 in[],
// (i) : Pointer to the input matrix.
short row,
// (i) : Number of rows of input matrix.
short col,
// (i) : Number of columns of input matrix.
fract16 mask[],
// (i) : Pointer to 5x5 mask.
fract16 out[])
// (o) : Pointer to the output matrix.
Registers used : A0, A1, R0-R3, R6-R7, I0-I3, B0-B3, M0-M3, L0-L3, P0-P2, LC0,
LC1.
Performance :
Code Size : 272 Bytes.
Cycle Count : 14 * row * col + 51 * row + 41
(14 cycles/pixel in core for all outputs except those
in first four columns.)
(23 cycles/pixel for outputs in first four columns of
output.)
4537 cycles for an input matrix size of 16 x 16.
*******************************************************************************/
.section L1_code;
.global __conv2d5x5_spl;
.align 8;

/*
 * __conv2d5x5_spl - 2-D circular convolution of a (row x col) fract16 matrix
 *                   with a 5 x 5 fract16 mask, two outputs per inner pass.
 *
 * On entry:
 *   R0       = in    (input matrix)
 *   R1       = row   (number of input rows)
 *   R2       = col   (number of input columns)
 *   [SP+12]  = mask  (read below as [SP+20] after the 8-byte R7:6 push)
 *   [SP+16]  = out   (read below as [SP+24] after the 8-byte R7:6 push)
 *
 * Register roles inside the routine:
 *   I0/B0/L0 : input pointer, circular over the whole matrix (2*row*col bytes)
 *   I1/B1/L1 : input pointer, circular over ONE row (2*col bytes); used only
 *              for the first four output columns, where the window wraps
 *              around the start of the row
 *   I2/B2/L2 : mask pointer, circular over 50 bytes, walked backwards
 *   I3/B3/L3 : output pointer, circular over the whole matrix
 *   M0 = 2*col - 8, M1 = 8 (temporarily 4), M2 = 10*col, M3 = 2*col
 *   P0 = 5 (mask rows), P1 = 2 (output pairs in the first four columns),
 *   P2 = col - 4 (columns handled by the second phase)
 *   R0 = byte counter for the row loops, R7 = 2*row*col
 *
 * Structure:
 *   Phase 1 (LOOP_FS_ROW): for every row, produce output columns 0..3.
 *   Phase 2 (LOOP_ROW)   : for every row, produce output columns 4..col-1.
 *
 * On exit:
 *   R7:6 are restored and L0-L3 are cleared. The Blackfin C run-time model
 *   expects L0-L3 to be zero whenever control returns to compiled code;
 *   leaving them loaded would turn the caller's later I-register accesses
 *   into circular ones. The four clearing instructions are not included in
 *   the code-size and cycle figures quoted in the file header.
 */
__conv2d5x5_spl:
        [--SP] = (R7:6);            // Save R7:6 (pushes 8 bytes)
        P0 = 5;                     // Inner loop count: 5 rows in the mask
        P1 = 2;                     // Two output pairs cover columns 0..3
        P2 = R2;                    // P2 = number of columns
        I0 = R0;                    // Input pointer starts at in[0]
        B0 = R0;                    // Input circular buffer base
        R7 = R1.L * R2.L (ISS2) || R3 = [SP+20];
                                    // R7 = 2 * row * col, R3 = address of mask
        L0 = R7;                    // Input circular buffer length (bytes)
        R2 = R2 << 1 || R6 = [SP+24];
                                    // R2 = 2 * col, R6 = address of output
        M3 = R2;                    // M3 = 2 * col (one row, in bytes)
        L1 = R2;                    // Per-row circular buffer length
        I2 = R3;                    // Mask pointer
        B2 = R3;                    // Mask circular buffer base
        L2 = 50;                    // Mask length = 25 elements * 2 bytes
        I3 = R6;                    // Output pointer
        B3 = R6;                    // Output circular buffer base
        L3 = R7;                    // Output circular buffer length (bytes)
        P2 += -4;                   // P2 = col - 4
        R0 = R2 << 2;               // R0 = 8 * col
        R6 = R0 + R2(S) || NOP;     // R6 = 10 * col
        M2 = R6;                    // M2 = 10 * col (five rows, in bytes)
        R0 = R2;
        R0 += -8;
        M0 = R0;                    // M0 = 2 * col - 8
        M1 = 8;                     // Column offset for output columns 0,1
        R0 = R0 - R0(S) || I0 += M3 || R3.L = W[I2--];
                                    // R0 = 0 (row byte counter); I0 += one row;
                                    // the load only serves to step I2 back so
                                    // that it wraps to the last mask element
        A1 = A0 = 0 || I0 -= M2 || R3.L = W[I2--];
                                    // Clear accumulators; I0 -= five rows, so
                                    // net I0 is four rows before in[0]
                                    // (wrapped by L0); R3.L = last mask
                                    // element, the first coefficient used

        /* ---- Phase 1: output columns 0..3 of every row ---- */
LOOP_FS_ROW:
        LSETUP (COL_FS_ST, COL_FS_END) LC0 = P1;
                                    // Two passes: columns 0,1 then 2,3
COL_FS_ST:
        B1 = I0;                    // Per-row circular buffer starts at the
                                    // current input row
        I1 = B1;
        I1 -= M1;                   // Step back 8 (or 4) bytes; wraps to the
                                    // end of the same row via L1
        LSETUP (CONV_FS_ST, CONV_FS_END) LC1 = P0;
                                    // Five passes, one per mask row
CONV_FS_ST:
        R1 = [I1++] || I0 += M3;
                                    // Fetch x0,x1; advance I0 to the next row
        A0 += R1.L * R3.L ,A1 += R1.H * R3.L || R1.L = W[I1++]
                                             || R3.H = W[I2--];
                                    // A0 += x0*h0, A1 += x1*h0; load x2, h1
        A0 += R1.H * R3.H, A1 += R1.L * R3.H || R1.H = W[I1++]
                                             || R3.L = W[I2--];
                                    // A0 += x1*h1, A1 += x2*h1; load x3, h2
        A0 += R1.L * R3.L, A1 += R1.H * R3.L || R1.L = W[I1++]
                                             || R3.H = W[I2--];
                                    // A0 += x2*h2, A1 += x3*h2; load x4, h3
        A0 += R1.H * R3.H, A1 += R1.L * R3.H || R1.H = W[I1++]
                                             || R3.L = W[I2--];
                                    // A0 += x3*h3, A1 += x4*h3; load x5, h4
        B1 = I0;                    // Re-base the per-row buffer on the next
        I1 = B1;                    // input row
CONV_FS_END:
        R1.L=(A0+=R1.L * R3.L),R1.H=(A1+=R1.H * R3.L) || I1 -= M1
                                             || R3.L = W[I2--];
                                    // A0 += x4*h4, A1 += x5*h4 and extract
                                    // both results into R1; apply the column
                                    // offset; load the next mask row's first
                                    // coefficient
        A1 = A0 = 0 || I0 -= M2 || [I3++] = R1;
                                    // Clear accumulators; I0 back five rows;
                                    // store the output pair
COL_FS_END:
        M1 = 4;                     // Last instruction of the LC0 loop:
                                    // column offset for output columns 2,3
        M1 = 8;                     // After the loop: restore the offset for
                                    // columns 0,1 of the next row
        R0 = R0 + R2(S) || I3 += M0 || R1 = [I0++M3];
                                    // LOOP_FS_ROW counter += 2*col; I3 skips
                                    // the remaining col-4 outputs of this row;
                                    // I0 moves to the next input row (the
                                    // loaded value is not used)
        CC = R0 < R7;
        If CC JUMP LOOP_FS_ROW (BP);
                                    // Repeat until every row has been done
        MNOP || I3 += M1;           // Skip the four outputs already stored in
                                    // the first row (M1 = 8 bytes)

        /* ---- Phase 2: output columns 4..col-1 of every row ---- */
LOOP_ROW:
        LSETUP (COL_ST, COL_END) LC0 = P2 >> 1;
                                    // (col - 4) / 2 output pairs per row
COL_ST: MNOP || R1 = [I0++];        // Fetch x0,x1
        LSETUP (CONV_ST, CONV_END) LC1 = P0;
                                    // Five passes, one per mask row
CONV_ST: A0 += R1.L * R3.L ,A1 += R1.H * R3.L || R1.L = W[I0++]
                                              || R3.H = W[I2--];
                                    // A0 += x0*h0, A1 += x1*h0; load x2, h1
        A0 += R1.H * R3.H, A1 += R1.L * R3.H || R1.H = W[I0++]
                                             || R3.L = W[I2--];
                                    // A0 += x1*h1, A1 += x2*h1; load x3, h2
        A0 += R1.L * R3.L, A1 += R1.H * R3.L || R1.L = W[I0]
                                             || R3.H = W[I2--];
                                    // A0 += x2*h2, A1 += x3*h2; load x4, h3
                                    // (I0 is not advanced here)
        A0 += R1.H * R3.H, A1 += R1.L * R3.H || R1 = [I0++M0]
                                             || R3.L = W[I2--];
                                    // A0 += x3*h3, A1 += x4*h3; load x4,x5 and
                                    // move I0 to the same columns of the next
                                    // row; load h4
CONV_END: R6.L=(A0+=R1.L * R3.L),R6.H=(A1+=R1.H * R3.L) || R1 = [I0++]
                                             || R3.L = W[I2--];
                                    // A0 += x4*h4, A1 += x5*h4 and extract
                                    // both results into R6; prefetch x0,x1 of
                                    // the next row and the next mask row's
                                    // first coefficient
COL_END:
        A1 = A0 = 0 || I0 -= M2 || [I3++] = R6;
                                    // Clear accumulators; I0 back five rows;
                                    // store the output pair
        R0 = R0 - R2(S) || R6 = [I0++M1] || I3 += M1;
                                    // LOOP_ROW counter -= 2*col; I0 and I3
                                    // each advance 8 bytes to reach the next
                                    // row (the loaded value is not used)
        CC = R0 == 0;
        If !CC JUMP LOOP_ROW (BP);
                                    // Repeat until every row has been done

        L0 = 0;                     // Disable circular addressing again so the
        L1 = 0;                     // caller's use of I0-I3 is linear, as the
        L2 = 0;                     // C run-time model requires on return
        L3 = 0;
        (R7:6) = [SP++];            // Restore R7:6
        RTS;
        NOP;                        // Avoids one stall if LINK or UNLINK
                                    // happens to be the next instruction after
                                    // RTS in memory.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -