📄 dotprod.asm

📁 davinci技术源码视频监控汇编源码
💻 ASM
字号:
* ========================================================================= *
*  TEXAS INSTRUMENTS, INC.                                                  *
*                                                                           *
*  NAME                                                                     *
*      dotprod -- dotprod                                                   *
*                                                                           *
*                                                                           *
*  REVISION DATE                                                            *
*      24-May-2005                                                          *
*                                                                           *
*   USAGE                                                                   *
*                                                                           *
*       This routine is C callable, and has the following C prototype:      *
*                                                                           *
*       int dotprod                                                         *
*       (                                                                   *
*           const short *m,       // Pointer to first vector  //            *
*           const short *n,       // Pointer to second vector //            *
*           int          count    // Length of vectors.       //            *
*       );                                                                  *
*                                                                           *
*       This routine returns the dot product as its return value.           *
*                                                                           *
*                                                                           *
*   DESCRIPTION                                                             *
*                                                                           *
*       The "dotprod" function implements a dot product of two input        *
*       vectors, returning the scalar result.  Each element of the          *
*       first array is multiplied with the corresponding element of the     *
*       second array, and the products are summed.  The sum is returned.    *
*                                                                           *
*       int dotprod                                                         *
*       (                                                                   *
*           const short *m,       // Pointer to first vector  //            *
*           const short *n,       // Pointer to second vector //            *
*           int          count    // Length of vectors.       //            *
*       )                                                                   *
*       {                                                                   *
*           int i, sum = 0;                                                 *
*                                                                           *
*           for (i = 0; i < count; i++)                                     *
*               sum += m[i] * n[i];                                         *
*                                                                           *
*           return sum;                                                     *
*       }                                                                   *
*                                                                           *
*       The above C code is a general implementation without                *
*       restrictions.  The assembly code has some restrictions, as          *
*       noted below.                                                        *
*                                                                           *
*                                                                           *
*   TECHNIQUES                                                              *
*                                                                           *
*       The code is unrolled 4 times to enable full memory and multiplier   *
*       bandwidth to be utilized.                                           *
*                                                                           *
*       One cycle for an XP stall exists to add A and B side sums.          *
*                                                                           *
*                                                                           *
*   ASSUMPTIONS                                                             *
*                                                                           *
*       The input length is a multiple of 4 and greater than 0.             *
*                                                                           *
*       The input data and coefficients are stored on double-word           *
*       aligned boundaries.                                                 *
*                                                                           *
*                                                                           *
*   MEMORY NOTE                                                             *
*                                                                           *
*       To avoid bank conflicts, the input arrays 'm' and 'n' must          *
*       be offset by 4 half-words (8 bytes).                                *
*                                                                           *
*       The code is ENDIAN NEUTRAL.                                         *
*                                                                           *
*                                                                           *
*   CYCLES                                                                  *
*                                                                           *
*       cycles = count/4 + 14                                               *
*       For count = 256, cycles = 78                                        *
*                                                                           *
*   CODESIZE                                                                *
*                                                                           *
*       64 bytes                                                            *
*                                                                           *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2005 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *


* ======================================================================== *
* ======================================================================== *
**********************= SYMBOLIC REGISTER ASSIGNMENTS ************************
        .asg    A4,   A_m     ; pointer to vector m, post-incremented by each LDDW
        .asg    B4,   B_n     ; pointer to vector n, post-incremented by each LDDW
        .asg    B21,  B_count ; value moved into ILC: count/4 - 4 (not the raw element count)
        .asg    A19,  A_sum   ; A-side running sum; accumulates A_prod every iteration
        .asg    A18,  A_prod  ; DOTP2 result of A_reg1 and B_reg1 (the odd registers of each pair)
        .asg    B19,  B_sum   ; B-side running sum; accumulates B_prod every iteration
        .asg    B18,  B_prod  ; DOTP2 result of A_reg0 and B_reg0 (the even registers of each pair)
        .asg    A17,  A_reg1  ; odd register of the pair loaded from m (two packed halfwords)
        .asg    A16,  A_reg0  ; even register of the pair loaded from m (two packed halfwords)
        .asg    B17,  B_reg1  ; odd register of the pair loaded from n (two packed halfwords)
        .asg    B16,  B_reg0  ; even register of the pair loaded from n (two packed halfwords)
        .asg    A4 ,  A_sumt  ; final A_sum + B_sum returned to caller; same register (A4) as A_m

        .text
        .global _dotprod
_dotprod:
* ======================================================================== *
*  int dotprod(const short *m, const short *n, int count)
*
*  On entry A_m and B_n hold the two vector pointers and the element count
*  is read from A6.  On exit the dot product is in A_sumt and control
*  returns through B3.  Each loop iteration consumes four halfwords from
*  each vector: two products are summed on the A side, two on the B side,
*  and the two partial sums are combined after the loop.
* ======================================================================== *
        SHR   .S2X    A6,        2,        B_count     ; count/4: one trip per 4 elements

        SUB   .L2     B_count,   4,        B_count     ; count/4-4: value handed to ILC below
                                                       ; NOTE(review): the -4 bias presumably
                                                       ; matches SPLOOPD's minimum trip count
                                                       ; at ii=1; confirm, and confirm the
                                                       ; behavior when count/4 is below 4

        SPLOOPD 1                                      ; start loop buffer, iteration interval 1
||      MVC   .S2     B_count, ILC                     ; load the inner loop counter
||      ZERO  .L1     A_sum                            ; A-side accumulator = 0
||      ZERO  .L2     B_sum                            ; B-side accumulator = 0
*----------------------------------------------------------------------------*
        LDDW  .D2T2   *B_n++,     B_reg1:B_reg0        ; fetch 4 halfwords of n, advance B_n
||      LDDW  .D1T1   *A_m++,     A_reg1:A_reg0        ; fetch 4 halfwords of m, advance A_m

        NOP   4                                        ; wait for the loaded data to arrive

        DOTP2 .M2X    A_reg0,     B_reg0,     B_prod   ; two products from the even registers
||      DOTP2 .M1X    A_reg1,     B_reg1,     A_prod   ; two products from the odd registers

        NOP   3                                        ; wait for the DOTP2 results

        SPKERNEL 4, 0                                  ; end of the loop body
||      ADD   .L2     B_sum,      B_prod,     B_sum    ; B_sum += B_prod
||      ADD   .L1     A_sum,      A_prod,     A_sum    ; A_sum += A_prod
*----------------------------------------------------------------------------*

        BNOP  .S2     B3, 4                            ; return to caller via B3

*---- Epilogue complete -----------------------------------------------------*
        ADD   .L1X    A_sum,      B_sum,      A_sumt   ; result = A_sum + B_sum, issued
                                                       ; before the branch takes effect

*---- Branch occurs ---------------------------------------------------------*
                .end

* ======================================================================== *
*  End of file: dotprod.asm                                                *
* ------------------------------------------------------------------------ *
*          Copyright (C) 2005 Texas Instruments, Incorporated.             *
*                          All Rights Reserved.                            *
* ======================================================================== *
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -