📄 mvm_lddw.asm

📁 TMS320bbs（源程序）的c67xfiles文件。用于在CCS2.0集成编译环境下实现TI的c67x系列DSP开发。是用DSP汇编语言
💻 ASM
字号:
*===============================================================================
*
*   TEXAS INSTRUMENTS, INC.     
*
*   MATRIX VECTOR MULTIPLY  (floating point, LDDW version)
*
*   Revision Date:  01/05/2000
*   
*   USAGE   This routine is C Callable and can be called as:
*       
*           mvmult(a, b, c, rows, columns); 
*
*       If the routine is not to be used as a C callable function,
*       then you need to initialize values for all of the parameters
*       passed to the function since these are assumed to be in
*       registers as defined by the calling convention of the
*       compiler, (refer to the TMS320C6x Optimizing C Compiler
*       User's Guide).
*
*   C CODE
*       This is the C equivalent for the assembly code.  Note that
*       the assembly code is hand optimized and restrictions may
*       apply.
*
*       void mvmult(float a[], float b[], float c[], short rows, short columns)
*       {
*           short i,j;
*           int cntr=0; 
*           float temp = 0;
*
*           for (i=0; i<rows; i++) {
*               for (j=0; j<columns; j++) {
*               temp += a[cntr] * b[j] ;
*               cntr = cntr + 1;
*               }
*           c[i] = temp;
*           temp = 0;
*           }
*       }
*
*   DESCRIPTION
*
*       This routine calculates the product of a matrix vector multiplication
*       A[][] * B[] = C[]
*                   
*       A has dimensions m by n (rows by columns)
*       B has dimensions n by 1
*       C has dimensions m by 1
*
*   TECHNIQUES
*
*       1.  The inner loop is unrolled 10 times and software pipelined.
*       2.  The first LDDW pair of each outer loop cycle after the first outer 
*           loop cycle occurs in the outer loop after the kernel.
*           This saves one loop per cycle plus the NOP's replaced. 
*       3.  The resetting of the inner counter has been moved from the top of the
*           outer loop to the bottom of the outer loop.  This saves one cycle
*           per outer loop iteration as an MV command replaces a NOP during 
*           the adding of the running sum.  For the first cycle the inner loop 
*           counter is set outside both loops (in parallel with the outer loop
*           counter setting).
*       4.  The Epilog is removed and extraneous loads are performed.  This reduces
*           the code size.  To compensate, the aptr is reset to the correct 
*           position (subtract 72) before continuing to the next iteration.
*
*   ASSUMPTIONS
*
*       1.  Little Endian is assumed for LDDW instruction.
*       2.  The entries of matrix A should be organized into an array, float a[], 
*           where the values are placed by rows;  i.e.  a[] = [row1, row2, ...]. 
*           The array a[] has (n*m) entries.
*       3.  The column dimension must be greater than or equal to 2 (i.e. n>=2)
*           and must be even.
*       4.  There is no restriction on the row dimension.  However, if the row
*           dimension is 1 the program essentially performs a dot product.
*           A more efficient realization of the dot product algorithm is 
*           possible and available.
*       
*   MEMORY NOTE
*
*       The a and b arrays should be placed on opposite double word 
*       boundaries to prevent internal data memory bank hits.       
*
*       ARGUMENTS PASSED
*
*       a[]  ->  A4
*       b[]  ->  B4
*       c[]  ->  A6
*       rows     ->  B6
*       columns  ->  A8
*
*   CYCLES
*   (m= # of rows,   n= # of columns)   
*   
*   [(n/2) + 24]*m + 7  with c overhead 
*
*===============================================================================
        .global _mvmult                 ; export the entry point
        .text                           ; assemble what follows into .text

* Symbolic register names used by _mvmult.  Note that mult0/temp2 share A2
* and mult1/temp3 share B1: the names differ only to describe the role the
* register plays in the kernel (product) versus the final summation (partial).

aptr    .set    A4  ; arg: a[]; used as the running read pointer into a[]
bptr    .set    B4  ; arg: b[]; never modified, source for resetting btmp
cptr    .set    A6  ; arg: c[]; post-incremented by each result store
rows    .set    B6  ; arg: number of rows (seeds ocntr)
colms   .set    A8  ; arg: number of columns (seeds icntr)
aa0     .set    A10 ; a[i]   (low half of the a[] LDDW pair); saved on stack
aa1     .set    A11 ; a[i+1] (high half of the a[] LDDW pair); saved on stack
bb0     .set    B10 ; b[i]   (low half of the b[] LDDW pair); saved on stack
bb1     .set    B11 ; b[i+1] (high half of the b[] LDDW pair); saved on stack
sum0    .set    A9  ; running sum of mult0; also holds the final row result
sum1    .set    B9  ; running sum of mult1
temp0   .set    A7  ; partial sum in the final reduction (also stages B10 in the prolog)
temp1   .set    B7  ; partial sum in the final reduction
temp2   .set    A2  ; partial sum in the final reduction (same register as mult0)
temp3   .set    B1  ; partial sum in the final reduction (same register as mult1)
mult0   .set    A2  ; product a[i]*b[i]
mult1   .set    B1  ; product a[i+1]*b[i+1]
btmp    .set    B5  ; running read pointer into b[]; reset from bptr every row
icntr   .set    A1  ; inner loop counter, used as the [icntr] condition
ocntr   .set    B2  ; outer loop counter, used as the [ocntr] condition
tempval .set    A3  ; scratch: copy of SP in the prolog, then the constant 2
SP      .set    B15 ; stack pointer

* _mvmult -- single-precision matrix * vector product, c[] = A[][] * b[].
*
*   In:     aptr (A4) = a[], bptr (B4) = b[], cptr (A6) = c[],
*           rows (B6), colms (A8).
*   Out:    one float stored through cptr per row; returns via B3.
*   Stack:  SP is decremented by (16) on entry and restored with
*           ADDAW SP,4 on exit; A10, A11, B10, B11 are saved and restored.
*   Writes without saving: A1, A2, A3, A5, A7, A9, B1, B2, B5, B7, B9;
*           A4 (aptr) and A6 (cptr) are advanced.
*
*   Structure: a prolog that only loads, then loads + multiplies, then a
*   single-packet kernel (iloop) that loads, multiplies and accumulates.
*   There is no epilog: the kernel keeps issuing loads that are never
*   used, and aptr is rewound by A5 (72) after each row to compensate.
*
*   NOTE(review): the schedule reads registers while earlier results are
*   still in flight (e.g. the four back-to-back reads of sum0/sum1 after
*   iloop).  Presumably this must not be interrupted -- confirm how the
*   caller handles interrupts.
*   NOTE(review): the extraneous loads read beyond the elements actually
*   consumed from a[] and b[]; confirm that memory is readable.
_mvmult:

*** BEGIN Benchmark Timing ***

            STW     .D2     A10,*SP--(16)       ; save A10 at SP, then SP -= (16)
||          MV      .L1     B10,A7              ; A7 = B10, staged for the .D1 store below

            STW     .D2     A11,*+SP(4)         ; save A11 on stack
||          MV      .L1     SP,tempval          ; tempval = SP, base address for the .D1 store

*** begin pipelining inner loop

            MV      .L2     bptr,btmp           ; move b* to btmp
||          MVK     .S1     72,A5               ; A5 = 72, amount aptr is rewound after each row
||          STW     .D2     B11,*+SP(12)        ; save B11 on stack
||          STW     .D1     A7,*+tempval(8)     ; save B10 (copied to A7) on stack

            LDDW    .D1T1   *aptr++(8),aa1:aa0  ;1  load a[i+1], a[i] from memory
||          LDDW    .D2T2   *btmp++(8),bb1:bb0  ;1  load b[i+1], b[i] from memory 
||          SUB     .S2     rows,1,ocntr        ;   ocntr = rows - 1
||          MVK     .S1     2,tempval           ;   tempval = 2, used below to form icntr

* Top of the per-row loop.  Load 1 of the row has already been issued
* (above for the first row, at the bottom of the loop for later rows).
oloop:

            LDDW    .D1T1   *aptr++(8),aa1:aa0  ;2  load a[i+1], a[i] from memory
||          LDDW    .D2T2   *btmp++(8),bb1:bb0  ;2  load b[i+1], b[i] from memory
||          ZERO    .L1     sum0                ;   zero first running sum reg.
||          ZERO    .L2     sum1                ;   zero second running sum reg.
||          SUB     .S1     colms,tempval,icntr ;   icntr = colms - 2

            LDDW    .D1T1   *aptr++(8),aa1:aa0  ;3  load a[i+1], a[i] from memory
||          LDDW    .D2T2   *btmp++(8),bb1:bb0      ;3  load b[i+1], b[i] from memory

            LDDW    .D1T1   *aptr++(8),aa1:aa0  ;4  load a[i+1], a[i] from memory
||          LDDW    .D2T2   *btmp++(8),bb1:bb0  ;4  load b[i+1], b[i] from memory

* From here every packet also issues a conditional branch to iloop and
* steps icntr down by 2 (one LDDW pair = two columns) while it is nonzero.
            LDDW    .D1T1   *aptr++(8),aa1:aa0  ;5  load a[i+1], a[i] from memory
||          LDDW    .D2T2   *btmp++(8),bb1:bb0  ;5  load b[i+1], b[i] from memory
||[icntr]   SUB     .S1     icntr,2,icntr       ;5  if(icntr) icntr -= 2
||[icntr]   B       .S2     iloop               ;1  if(icntr) branch to iloop

* Multiplies join the pipeline here, consuming the data of load 1.
            LDDW    .D1T1   *aptr++(8),aa1:aa0  ;6  load a[i+1], a[i] from memory
||          LDDW    .D2T2   *btmp++(8),bb1:bb0  ;6  load b[i+1], b[i] from memory
||          MPYSP   .M1X    aa0,bb0,mult0       ;1  mult0 = a[i]*b[i]
||          MPYSP   .M2X    aa1,bb1,mult1       ;1  mult1 = a[i+1]*b[i+1]
||[icntr]   SUB     .S1     icntr,2,icntr       ;6  if(icntr) icntr -= 2
||[icntr]   B       .S2     iloop               ;2  if(icntr) branch to iloop


            LDDW    .D1T1   *aptr++(8),aa1:aa0  ;7  load a[i+1], a[i] from memory
||          LDDW    .D2T2   *btmp++(8),bb1:bb0  ;7  load b[i+1], b[i] from memory
||          MPYSP   .M1X    aa0,bb0,mult0       ;2  mult0 = a[i]*b[i]
||          MPYSP   .M2X    aa1,bb1,mult1       ;2  mult1 = a[i+1]*b[i+1]
||[icntr]   SUB     .S1     icntr,2,icntr       ;7  if(icntr) icntr -= 2
||[icntr]   B       .S2     iloop               ;3  if(icntr) branch to iloop

            LDDW    .D1T1   *aptr++(8),aa1:aa0  ;8  load a[i+1], a[i] from memory
||          LDDW    .D2T2   *btmp++(8),bb1:bb0  ;8  load b[i+1], b[i] from memory
||          MPYSP   .M1X    aa0,bb0,mult0       ;3  mult0 = a[i]*b[i]
||          MPYSP   .M2X    aa1,bb1,mult1       ;3  mult1 = a[i+1]*b[i+1]
||[icntr]   SUB     .S1     icntr,2,icntr       ;8  if(icntr) icntr -= 2
||[icntr]   B       .S2     iloop               ;4  if(icntr) branch to iloop

            LDDW    .D1T1   *aptr++(8),aa1:aa0  ;9  load a[i+1], a[i] from memory
||          LDDW    .D2T2   *btmp++(8),bb1:bb0  ;9  load b[i+1], b[i] from memory
||          MPYSP   .M1X    aa0,bb0,mult0       ;4  mult0 = a[i]*b[i]
||          MPYSP   .M2X    aa1,bb1,mult1       ;4  mult1 = a[i+1]*b[i+1]
||[icntr]   SUB     .S1     icntr,2,icntr       ;9  if(icntr) icntr -= 2
||[icntr]   B       .S2     iloop               ;5  if(icntr) branch to iloop

**************

* Kernel: one execute packet that loads, multiplies and accumulates.
* It is entered once by fall-through and again for every branch taken.
* (The M2 multiply lists its operands as bb1,aa1 here; the product is
* the same as in the prolog packets.)
iloop:
            LDDW    .D1T1   *aptr++(8),aa1:aa0  ;10 load a[i+1], a[i] from memory
||          LDDW    .D2T2   *btmp++(8),bb1:bb0  ;10 load b[i+1], b[i] from memory
||          MPYSP   .M1X    aa0,bb0,mult0       ;5  mult0 = a[i]*b[i]
||          MPYSP   .M2X    bb1,aa1,mult1       ;5  mult1 = a[i+1]*b[i+1]
||          ADDSP   .L1     mult0,sum0,sum0     ;1  sum0 = sum0+mult0
||          ADDSP   .L2     mult1,sum1,sum1     ;1  sum1 = sum1+mult1
||[icntr]   SUB     .S1     icntr,2,icntr       ;10 if(icntr) icntr -= 2
||[icntr]   B       .S2     iloop               ;6  if(icntr) branch to iloop

*** add up the running sums ***

* sum0/sum1 are read on four consecutive cycles.  Presumably each read
* sees a different in-flight partial sum of the pipelined ADDSP above, so
* this order and spacing must be kept -- confirm against the ADDSP
* latency in the CPU reference guide.
            ADDSP   .L1X    sum0,sum1,temp0     ;   temp0 = sum0+sum1   
            ADDSP   .L2X    sum0,sum1,temp1     ;   temp1 = sum0+sum1
            ADDSP   .L1X    sum0,sum1,temp2     ;   temp2 = sum0+sum1
            ADDSP   .L2X    sum0,sum1,temp3     ;   temp3 = sum0+sum1

            SUB     .S1     aptr,A5,aptr        ;   aptr -= 72, undoing the extraneous a[] loads

            ADDSP   .L1X    temp0,temp1,temp0   ;   temp0 = temp0+temp1
            NOP             1                   ;   wait for temp2
            ADDSP   .L1X    temp2,temp3,temp2   ;   temp2 = temp2+temp3
            NOP             1                   ;   wait for temp0
            MV      .S2     bptr,btmp           ;   btmp = bptr
 [ocntr]    B       .S2     oloop               ;   if(ocntr) branch to oloop
            ADDSP   .L1     temp0,temp2,sum0    ;   sum0 = temp0+temp2  

* The five packets from the ADDSP above through the STW below follow the
* branch to oloop.  The two loads are not predicated, so they are also
* issued after the last row, where their data is not used.
            LDDW    .D1T1   *aptr++(8),aa1:aa0  ;1  load a[i+1], a[i] from memory
            LDDW    .D2T2   *btmp++(8),bb1:bb0  ;1  load b[i+1], b[i] from memory
 [ocntr]    SUB     .L2     ocntr,1,ocntr       ;   if(ocntr) ocntr -= 1
            STW     .D1     sum0,*cptr++(4)     ;   store c[i] = sum0

* Function exit: reload the four saved registers, return and release the frame.
            LDW     .D2     *+SP(16),A10        ; restore A10 from stack

            LDW     .D2     *+SP(12),B11        ; restore B11 from stack        

            LDW     .D2     *+SP(8),B10         ; restore B10 from stack
||          B       .S2     B3                  ; return from function

            LDW     .D2     *+SP(4),A11         ; restore A11 from stack

*** END Benchmark Timing ***

B_END:
            ADDAW   .D2     SP,4,SP             ; Restore SP (4 words, matching the (16) on entry)

            NOP             3                   ; pad until the B B3 above takes effect
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -