* File: mvm_lddw.asm
*===============================================================================
*
* TEXAS INSTRUMENTS, INC.
*
* MATRIX VECTOR MULTIPLY (floating point, LDDW version)
*
* Revision Date: 01/05/2000
*
* USAGE This routine is C Callable and can be called as:
*
* mvmult(a, b, c, rows, columns);
*
* If the routine is not to be used as a C callable function,
* then you need to initialize values for all of the parameters
* passed to the function since these are assumed to be in
* registers as defined by the calling convention of the
* compiler, (refer to the TMS320C6x Optimizing C Compiler
* User's Guide).
*
* C CODE
* This is the C equivalent for the assembly code. Note that
* the assembly code is hand optimized and restrictions may
* apply.
*
* void mvmult(float a[], float b[], float c[], short rows, short columns)
* {
* short i,j;
* int cntr=0;
* float temp = 0;
*
* for (i=0; i<rows; i++) {
* for (j=0; j<columns; j++) {
* temp += a[cntr] * b[j] ;
* cntr = cntr + 1;
* }
* c[i] = temp;
* temp = 0;
* }
* }
*
* DESCRIPTION
*
* This routine calculates the product of a matrix vector multiplication
* A[][] * B[] = C[]
*
* A has dimensions m by n (rows by columns)
* B has dimensions n by 1
* C has dimensions m by 1
*
* TECHNIQUES
*
* 1. The inner loop is unrolled 10 times and software pipelined.
* 2. The first LDDW pair of each outer loop iteration after the first outer
* loop iteration is issued in the outer loop, after the kernel.
* This saves one cycle per outer loop iteration, plus the NOPs it replaces.
* 3. The resetting of the inner counter has been moved from the top of the
* outer loop to the bottom of the outer loop. This saves one cycle
* per outer loop iteration as an MV command replaces a NOP during
* the adding of the running sum. For the first cycle the inner loop
* counter is set outside both loops (in parallel with the outer loop
* counter setting).
* 4. The Epilog is removed and extraneous loads are performed. This reduces
* the code size. To compensate, the aptr is reset to the correct
* position (subtract 72) before continuing to the next iteration.
*
* ASSUMPTIONS
*
* 1. Little Endian is assumed for LDDW instruction.
* 2. The entries of matrix A should be organized into an array, float a[],
* where the values are placed by rows; i.e. a[] = [row1, row2, ...].
* The array a[] has (n*m) entries.
* 3. The column dimension must be greater than or equal to 2 (i.e. n>=2)
* and must be even.
* 4. There is no restriction on the row dimension. However, if the row
* dimension is 1 the program essentially performs a dot product.
* A more efficient realization of the dot product algorithm is
* possible and available.
*
* MEMORY NOTE
*
* The a and b arrays should be placed on opposite double word
* boundaries to prevent internal data memory bank hits.
*
* ARGUMENTS PASSED
*
* a[] -> A4
* b[] -> B4
* c[] -> A6
* rows -> B6
* columns -> A8
*
* CYCLES
* (m= # of rows, n= # of columns)
*
* [(n/2) + 24]*m + 7 with c overhead
*
*===============================================================================
* ---- Exported symbol and section ---------------------------------------------
        .global _mvmult
        .text

* ---- Register aliases used by _mvmult ----------------------------------------
* Incoming arguments (see ARGUMENTS PASSED in the file header)
aptr    .set    A4      ; pointer into a[] (matrix, stored row by row)
bptr    .set    B4      ; base pointer of b[] (vector)
cptr    .set    A6      ; pointer into c[] (result vector)
rows    .set    B6      ; number of rows
colms   .set    A8      ; number of columns

* Loop bookkeeping
btmp    .set    B5      ; working copy of bptr
icntr   .set    A1      ; inner (column) loop counter
ocntr   .set    B2      ; outer (row) loop counter
tempval .set    A3      ; general scratch value

* Loaded operand pairs -- these four registers are saved on the stack
aa0     .set    A10     ; a[i]
aa1     .set    A11     ; a[i+1]
bb0     .set    B10     ; b[i]
bb1     .set    B11     ; b[i+1]

* Products and running sums
mult0   .set    A2      ; product a[i]   * b[i]
mult1   .set    B1      ; product a[i+1] * b[i+1]
sum0    .set    A9      ; running sum, A side
sum1    .set    B9      ; running sum, B side

* Scratch used while combining the running sums
temp0   .set    A7      ;
temp1   .set    B7      ;
temp2   .set    A2      ; same register as mult0
temp3   .set    B1      ; same register as mult1

* Stack pointer
SP      .set    B15     ;
*-------------------------------------------------------------------------------
* _mvmult -- single-precision matrix * vector product (see file header)
*
* In (per ARGUMENTS PASSED in the file header):
*   aptr (A4) = a[], bptr (B4) = b[], cptr (A6) = c[],
*   rows (B6), colms (A8)
* Out:
*   One float is stored through cptr per outer-loop pass (rows stores total).
* Registers:
*   A10, A11, B10, B11 are saved in a 16-byte stack frame and restored.
*   A1, A2, A3, A5, A7, A9, B1, B2, B5, B7, B9 are written and not restored.
*   aptr and cptr are advanced; bptr is only read.
* Returns by branching to B3.
*
* NOTE(review): instruction lines in this listing start in column 1; the TI
* assembler normally expects mnemonics to be indented -- leading whitespace
* looks lost in extraction, confirm against the original file.
*-------------------------------------------------------------------------------
_mvmult:
*** BEGIN Benchmark Timing ***
* Prologue: open the frame and save the four operand registers.
* Resulting layout relative to the new SP, as used by the restores at the end:
*   SP+16 = A10, SP+12 = B11, SP+8 = B10, SP+4 = A11
STW .D2 A10,*SP--(16) ; save A10 at the incoming SP, then SP -= 16
|| MV .L1 B10,A7 ; copy B10 to A7 so it can be stored through .D1 below
STW .D2 A11,*+SP(4) ; save A11 on stack
|| MV .L1 SP,tempval ; tempval = SP, address base for the .D1 store of A7
*** begin pipelining inner loop
MV .L2 bptr,btmp ; move b* to btmp
|| MVK .S1 72,A5 ; A5 = 72, subtracted from aptr after each row (header, TECHNIQUES 4)
|| STW .D2 B11,*+SP(12) ; save B11 on stack
|| STW .D1 A7,*+tempval(8) ; save B10 (held in A7) at SP+8
* First load pair of the first row. For later rows the matching loads are
* issued near the bottom of the outer loop, before branching back to oloop.
LDDW .D1T1 *aptr++(8),aa1:aa0 ;1 load a[i+1], a[i] from memory
|| LDDW .D2T2 *btmp++(8),bb1:bb0 ;1 load b[i+1], b[i] from memory
|| SUB .S2 rows,1,ocntr ; ocntr = rows - 1
|| MVK .S1 2,tempval ; tempval = 2; A3 is not written again, so it stays 2
* Outer loop entry, once per row: clear both running sums and set
* icntr = colms - 2. The leading number in each trailing comment (;1, ;2, ...)
* counts issues of that instruction within the current row.
oloop:
LDDW .D1T1 *aptr++(8),aa1:aa0 ;2 load a[i+1], a[i] from memory
|| LDDW .D2T2 *btmp++(8),bb1:bb0 ;2 load b[i+1], b[i] from memory
|| ZERO .L1 sum0 ; zero first running sum reg.
|| ZERO .L2 sum1 ; zero second running sum reg.
|| SUB .S1 colms,tempval,icntr ; icntr = colms - 2
LDDW .D1T1 *aptr++(8),aa1:aa0 ;3 load a[i+1], a[i] from memory
|| LDDW .D2T2 *btmp++(8),bb1:bb0 ;3 load b[i+1], b[i] from memory
LDDW .D1T1 *aptr++(8),aa1:aa0 ;4 load a[i+1], a[i] from memory
|| LDDW .D2T2 *btmp++(8),bb1:bb0 ;4 load b[i+1], b[i] from memory
* From here every packet also issues a conditional branch to iloop and
* decrements icntr by 2 while icntr is non-zero.
LDDW .D1T1 *aptr++(8),aa1:aa0 ;5 load a[i+1], a[i] from memory
|| LDDW .D2T2 *btmp++(8),bb1:bb0 ;5 load b[i+1], b[i] from memory
||[icntr] SUB .S1 icntr,2,icntr ;5 if(icntr) icntr -= 2
||[icntr] B .S2 iloop ;1 if(icntr) branch to iloop
* The multiplies start here, on the data of the first load pair.
LDDW .D1T1 *aptr++(8),aa1:aa0 ;6 load a[i+1], a[i] from memory
|| LDDW .D2T2 *btmp++(8),bb1:bb0 ;6 load b[i+1], b[i] from memory
|| MPYSP .M1X aa0,bb0,mult0 ;1 mult0 = a[i]*b[i]
|| MPYSP .M2X aa1,bb1,mult1 ;1 mult1 = a[i+1]*b[i+1]
||[icntr] SUB .S1 icntr,2,icntr ;6 if(icntr) icntr -= 2
||[icntr] B .S2 iloop ;2 if(icntr) branch to iloop
LDDW .D1T1 *aptr++(8),aa1:aa0 ;7 load a[i+1], a[i] from memory
|| LDDW .D2T2 *btmp++(8),bb1:bb0 ;7 load b[i+1], b[i] from memory
|| MPYSP .M1X aa0,bb0,mult0 ;2 mult0 = a[i]*b[i]
|| MPYSP .M2X aa1,bb1,mult1 ;2 mult1 = a[i+1]*b[i+1]
||[icntr] SUB .S1 icntr,2,icntr ;7 if(icntr) icntr -= 2
||[icntr] B .S2 iloop ;3 if(icntr) branch to iloop
LDDW .D1T1 *aptr++(8),aa1:aa0 ;8 load a[i+1], a[i] from memory
|| LDDW .D2T2 *btmp++(8),bb1:bb0 ;8 load b[i+1], b[i] from memory
|| MPYSP .M1X aa0,bb0,mult0 ;3 mult0 = a[i]*b[i]
|| MPYSP .M2X aa1,bb1,mult1 ;3 mult1 = a[i+1]*b[i+1]
||[icntr] SUB .S1 icntr,2,icntr ;8 if(icntr) icntr -= 2
||[icntr] B .S2 iloop ;4 if(icntr) branch to iloop
LDDW .D1T1 *aptr++(8),aa1:aa0 ;9 load a[i+1], a[i] from memory
|| LDDW .D2T2 *btmp++(8),bb1:bb0 ;9 load b[i+1], b[i] from memory
|| MPYSP .M1X aa0,bb0,mult0 ;4 mult0 = a[i]*b[i]
|| MPYSP .M2X aa1,bb1,mult1 ;4 mult1 = a[i+1]*b[i+1]
||[icntr] SUB .S1 icntr,2,icntr ;9 if(icntr) icntr -= 2
||[icntr] B .S2 iloop ;5 if(icntr) branch to iloop
**************
* Loop kernel: one execute packet that loads the next a/b pairs, multiplies
* an earlier pair and accumulates earlier products into sum0 / sum1.
* It keeps issuing loads and multiplies until the loop exits, so it reads
* past the data that is actually needed (header, TECHNIQUES 4).
* The .M2X operand order (bb1,aa1) differs from the packets above (aa1,bb1);
* both forms multiply the same two registers.
iloop:
LDDW .D1T1 *aptr++(8),aa1:aa0 ;10 load a[i+1], a[i] from memory
|| LDDW .D2T2 *btmp++(8),bb1:bb0 ;10 load b[i+1], b[i] from memory
|| MPYSP .M1X aa0,bb0,mult0 ;5 mult0 = a[i]*b[i]
|| MPYSP .M2X bb1,aa1,mult1 ;5 mult1 = a[i+1]*b[i+1]
|| ADDSP .L1 mult0,sum0,sum0 ;1 sum0 = sum0+mult0
|| ADDSP .L2 mult1,sum1,sum1 ;1 sum1 = sum1+mult1
||[icntr] SUB .S1 icntr,2,icntr ;10 if(icntr) icntr -= 2
||[icntr] B .S2 iloop ;6 if(icntr) branch to iloop
*** add up the running sums ***
* sum0 and sum1 are read on four consecutive cycles.
* NOTE(review): presumably each read captures a different partial sum still
* draining from the ADDSP pipeline, so these four adds must stay in
* consecutive single-instruction packets -- confirm against C67x latencies.
ADDSP .L1X sum0,sum1,temp0 ; temp0 = sum0+sum1
ADDSP .L2X sum0,sum1,temp1 ; temp1 = sum0+sum1
ADDSP .L1X sum0,sum1,temp2 ; temp2 = sum0+sum1
ADDSP .L2X sum0,sum1,temp3 ; temp3 = sum0+sum1
SUB .S1 aptr,A5,aptr ; aptr -= 72: undo the extraneous loads' pointer advance
ADDSP .L1X temp0,temp1,temp0 ; temp0 = temp0+temp1
NOP 1 ; wait for temp2
ADDSP .L1X temp2,temp3,temp2 ; temp2 = temp2+temp3
NOP 1 ; wait for temp0
MV .S2 bptr,btmp ; btmp = bptr
* Branch back for the next row while ocntr is non-zero.
* NOTE(review): the five packets after the branch look like its delay slots:
* they finish the sum, issue the next row's first load pair, decrement ocntr
* and store c[i]. They also execute on the final pass, when the branch is not
* taken, so the two loads are then extraneous.
[ocntr] B .S2 oloop ; if(ocntr) branch to oloop
ADDSP .L1 temp0,temp2,sum0 ; sum0 = temp0+temp2
LDDW .D1T1 *aptr++(8),aa1:aa0 ;1 load a[i+1], a[i] from memory
LDDW .D2T2 *btmp++(8),bb1:bb0 ;1 load b[i+1], b[i] from memory
[ocntr] SUB .L2 ocntr,1,ocntr ; if(ocntr) ocntr -= 1
STW .D1 sum0,*cptr++(4) ; store c[i] = sum0
* Epilogue: reload the saved registers, return through B3 and release the frame.
LDW .D2 *+SP(16),A10 ; restore A10 from stack
LDW .D2 *+SP(12),B11 ; restore B11 from stack
LDW .D2 *+SP(8),B10 ; restore B10 from stack
|| B .S2 B3 ; return from function
LDW .D2 *+SP(4),A11 ; restore A11 from stack
*** END Benchmark Timing ***
B_END:
ADDAW .D2 SP,4,SP ; Restore SP (4 words = the 16 bytes taken on entry)
NOP 3 ; idle until the branch to B3 takes effect
* (Web code-viewer keyboard-shortcut help text removed; it is not part of the source.)