* File: autocorf.asm
* (web code-viewer residue: font-size control)
*===============================================================================
*
* TEXAS INSTRUMENTS, INC.
*
* AUTOCORRELATION
*
* Revision Date: 4/30/98
*
* USAGE
*
* This routine is C Callable and can be called as:
*
* void autocor( float *acl, const float *inp, int M, int N)
*
* acl[] --- Resulting array of autocorrelation
* inp[] --- Input array of autocorrelation
* M --- Length of autocorrelation vector (MULTIPLE of 4)
* N --- { Length of Input array (inp[]) vector - M }
* (MULTIPLE of 2)
*
* If routine is not to be used as a C callable function then
* you need to initialize values for all of the values passed
* as these are assumed to be in registers as defined by the
* calling convention of the compiler, (refer to the C compiler
* reference guide).
*
* ARGUMENTS PASSED -> REGISTER
* ---------------------------------
* acl -> A4
* inp -> B4
* M -> A6
* N -> B6
*
* C CODE
*
* This is the C equivalent of the assembly code. Note that
* the assembly code is hand optimized and restrictions may
* apply.
*
* void autocor( float *acl, const float *inp, int M, int N)
* {
* int i,k;
* float sum;
*
* for (i = 0; i < M; i++)
* {
* sum = 0;
* for (k = M; k < N+M; k++)
* {
* sum += inp[k] * inp[k-i];
* }
* acl[i] = sum ;
* }
* }
*
* DESCRIPTION
*
* This routine performs the autocorrelation of the input array inp.
* It is assumed that the length of the input array, inp, is a
* multiple of 2 and the length of the output array, acl, is a
* multiple of 4. The assembly routine computes 4 output samples
* at a time.
*
* TECHNIQUES
*
* The inner loop is unrolled twice. The length of
* the input array must be a multiple of 2. The outer
* loop is unrolled four times so the length of output array must
* be a multiple of 4.
*
* The outer loop is conditionally executed in parallel with the
* inner loop. This allows for a zero overhead outer loop.
*
* ASSUMPTIONS
*
* N is a multiple of 2 and greater than 4
* M is a multiple of 4 and greater than 4
* inp is aligned on even doubleword boundary
* acl is offset by a word from inp alignment
* inp is assumed to be padded with M zeros starting from location 0
*
* MEMORY NOTE
*
* No Memory bank hits if inp and acl alignment assumptions apply
*
* CYCLES
*
* (N/2)*M + (M/2)*5 + 9
*
* NOTATIONS
*
* f = Function Prolog or Epilog
* o = Outer Loop
* p = Inner Loop Prolog
*
*================================================================================
;*=============================================================================*
; _autocor -- C-callable single-precision autocorrelation (see file header).
;
; Entry registers (per the file header): A4 = acl, B4 = inp, A6 = M, B6 = N.
; B10 is pushed on the stack at entry and popped before returning; the routine
; returns by branching to B3.
;
; Register roles, as given by the original inline comments:
;   A0        inp0  - running pointer for the inp[k], inp[k+1] loads (A9:A8)
;   A3        temp  - saved &inp[M], used to rewind A0 on every outer pass
;   A7        inp1  - running pointer for the lagged loads (B1:B0, B9:B8, B7)
;   B4        inp   - lagged-window base, advanced once per outer pass
;   A1        cntr1 - outer loop counter, starts at M - 4, steps by 4
;   A2        cntr2 - inner loop counter, starts at N - 4, steps by 2
;   B2        lcntr - load counter (N - 4)/2 that gates the kernel loads;
;                     also reused as the acl store pointer in the outer epilog
;   B6        holds N - 4 after the prolog
;   A5 / B5   product registers for the A-side / B-side multiplies
;   A6 / B10  accumulators (A6 holds M on entry and is reused once M has been
;             consumed); the comments label successive ADDSPs on the same
;             register as sum1..sum4 / sum5..sum8
;
; Syntax reminders: "||" places an instruction in the same execute packet as
; the one before it; "[reg]" executes the instruction only when reg is nonzero.
;
; The code is hand scheduled around load/multiply/add/branch delay slots.
; Do not reorder, insert or remove instructions without re-deriving the
; schedule.
;
; NOTE(review): in this copy every mnemonic starts in column 1. The TI
; assembler normally expects only labels and comments there -- confirm the
; indentation against the original file before assembling.
;*=============================================================================*
.global _autocor
.text
_autocor:
; BEGIN BENCHMARK TIMING
; Function prolog: save B10, derive the working pointers and counters, and
; issue the first two sets of loads and multiplies to prime the pipeline.
STW .D2 B10,*B15--(4) ; f push B10
|| MV .L1X B4,A3 ; f temp = inp
|| ADDAW .D1 A4,A6,A4 ; f acl = &acl[M]
|| ADD .L2 B4,8,B4 ; f inp += 8
;*-----------------------------------------------------------------------------*
LDDW .D2 *B4++,B1:B0 ; p @ (k3:k2) inp[k-(i-3)]:inp[k-(i-2)]
|| ADDAW .D1 A3,A6,A0 ; f inp0 = &inp[M]
LDDW .D1 *A0++,A9:A8 ; p @ (kk1:kk0) inp[k+1]:inp[k]
|| MV .L1X B4,A7 ; f inp1 = inp
|| MV .S1 A0,A3 ; f temp = inp0
LDW .D1T2 *-A7(12),B7 ; p @ (k1) inp[k-(i-1)] (or *-inp1[3] )
|| SUB .S1 A6,4,A1 ; f (outer loop counter) cntr1 = M - 4
LDDW .D1T2 *A7,B9:B8 ; p @ (k5:k4) inp[k-(i-5)]:inp[k-(i-4)]
LDDW .D1T2 *A7++,B1:B0 ; p @@ (k3:k2) inp[k-(i-3)]:inp[k-(i-2)]
LDDW .D1 *A0++,A9:A8 ; p @@ (kk1:kk0) inp[k+1]:inp[k]
|| SUB .L1X B6,4,A2 ; p (inner loop counter) cntr2 = N - 4
|| SUB .L2 B6,4,B6 ; f N = N - 4
LDW .D1T2 *-A7(12),B7 ; p @@ (k1) inp[k-(i-1)] (or *-inp1[3] )
|| MPYSP .M1X A8,B0,A5 ; p prod2 = inp[k]*inp[k-(i-2)]
|| MPYSP .M2X A9,B1,B5 ; p prod6 = inp[k+1]*inp[k-(i-3)]
LDDW .D1T2 *A7,B9:B8 ; p @@ (k5:k4) inp[k-(i-5)]:inp[k-(i-4)]
|| MPYSP .M1X A8,B7,A5 ; p prod1 = inp[k]*inp[k-(i-1)]
|| MPYSP .M2X A9,B0,B5 ; p prod5 = inp[k+1]*inp[k-(i-2)]
|| SHRU .S2 B6,1,B2 ; p (load counter) lcntr = (N - 4)/2
; Outer loop entry: two packets that finish the inner-loop prolog, clear both
; accumulators and start the first branch to the kernel.
LOOP1: ; OUTER LOOP
[B2] LDDW .D1T2 *A7++,B1:B0 ; @@@ (k3:k2) inp[k-(i-3)]:inp[k-(i-2)]
|| MPYSP .M1X A8,B1,A5 ; prod3 = inp[k]*inp[k-(i-3)]
|| MPYSP .M2X A9,B8,B5 ; prod7 = inp[k+1]*inp[k-(i-4)]
|| B .S2 LOOP2 ; Branch to inner loop
[B2] LDDW .D1 *A0++,A9:A8 ; @@@ (kk1:kk0) inp[k+1]:inp[k]
|| MPYSP .M1X A8,B8,A5 ; prod4 = inp[k]*inp[k-(i-4)]
|| MPYSP .M2X A9,B9,B5 ; prod8 = inp[k+1]*inp[k-(i-5)]
|| ZERO .L1 A6 ; sum1 = sum2 = sum3 = sum4 = 0
|| ZERO .L2 B10 ; sum5 = sum6 = sum7 = sum8 = 0
|| ADD .D2 8,B4,B4 ; inp = inp + 8
;*-----------------------------------------------------------------*
; Kernel: four execute packets per pass. Loads are gated by lcntr (B2) so they
; stop once the load counter reaches zero; the loop branch and the cntr2
; decrement are gated by cntr2 (A2).
LOOP2: ; KERNEL
[B2] LDW .D1T2 *-A7(12),B7 ; @@@ (k1) inp[k-(i-1)] (or *-inp1[3] )
|| MPYSP .M1X A8,B0,A5 ; prod2 = inp[k]*inp[k-(i-2)]
|| MPYSP .M2X A9,B1,B5 ; prod6 = inp[k+1]*inp[k-(i-3)]
|| ADDSP .L1 A6,A5,A6 ; sum2 = sum2 + prod2
|| ADDSP .L2 B10,B5,B10 ; sum6 = sum6 + prod6
[B2] LDDW .D1T2 *A7,B9:B8 ; @@@ (k5:k4) inp[k-(i-5)]:inp[k-(i-4)]
|| MPYSP .M1X A8,B7,A5 ; prod1 = inp[k]*inp[k-(i-1)]
|| MPYSP .M2X A9,B0,B5 ; prod5 = inp[k+1]*inp[k-(i-2)]
|| ADDSP .L1 A6,A5,A6 ; sum1 = sum1 + prod1
|| ADDSP .L2 B10,B5,B10 ; sum5 = sum5 + prod5
||[B2] SUB .S2 B2,1,B2 ; lcntr = lcntr - 1
[B2] LDDW .D1T2 *A7++,B1:B0 ; @@@@ (k3:k2) inp[k-(i-3)]:inp[k-(i-2)]
|| MPYSP .M1X A8,B1,A5 ; prod3 = inp[k]*inp[k-(i-3)]
|| MPYSP .M2X A9,B8,B5 ; prod7 = inp[k+1]*inp[k-(i-4)]
|| ADDSP .L1 A6,A5,A6 ; sum3 = sum3 + prod3
|| ADDSP .L2 B10,B5,B10 ; sum7 = sum7 + prod7
||[A2] SUB .S1 A2,2,A2 ; cntr2 = cntr2 - 2
||[A2] B .S2 LOOP2
[B2] LDDW .D1 *A0++,A9:A8 ; @@@@ (kk1:kk0) inp[k+1]:inp[k]
|| MPYSP .M1X A8,B8,A5 ; prod4 = inp[k]*inp[k-(i-4)]
|| MPYSP .M2X A9,B9,B5 ; prod8 = inp[k+1]*inp[k-(i-5)]
|| ADDSP .L1 A6,A5,A6 ; sum4 = sum4 + prod4
|| ADDSP .L2 B10,B5,B10 ; sum8 = sum8 + prod8
;*-----------------------------------------------------------------*
; Outer-loop epilog, overlapped with the next pass's prolog. The "o"
; instructions combine the A-side and B-side sums and store four results; the
; [A1]-predicated "p" instructions re-prime the loads and multiplies and are
; skipped once cntr1 (A1) has reached zero, i.e. on the final pass.
; acl (A4) starts at &acl[M] and is lowered by 16 bytes per pass, so each
; group of four results is stored below the previous one.
ADDSP .L1 A6,B10,A6 ; o sum2 = sum2 + sum6
|| MV .S1 A3,A0 ; p inp0 = temp ( = &inp[M])
||[A1] LDDW .D2 *B4++,B1:B0 ; p @ (k3:k2) inp[k-(i-3)]:inp[k-(i-2)]
ADDSP .L2X A6,B10,B10 ; o sum5 = sum1 + sum5
||[A1] LDDW .D1 *A0++,A9:A8 ; p @ (kk1:kk0) inp[k+1]:inp[k]
||[A1] MV .S1X B4,A7 ; p inp1 = inp
ADDSP .L1 A6,B10,A6 ; o sum3 = sum3 + sum7
||[A1] B .S2 LOOP1 ; o Branch to outer loop
||[A1] LDW .D1T2 *-A7(12),B7 ; p @ (k1) inp[k-(i-1)] (or *-inp1[3] )
ADDSP .L1 A6,B10,A6 ; o sum4 = sum4 + sum8
|| MV .S2X A4,B2 ; o B2 (lcntr register) reused as store pointer = acl
|| SUB .S1 A4,16,A4 ; o acl = acl - 16
||[A1] LDDW .D1T2 *A7,B9:B8 ; p @ (k5:k4) inp[k-(i-5)]:inp[k-(i-4)]
;*
STW .D2T1 A6,*--B2[2] ; o acl[i-2] = sum2
||[A1] LDDW .D1T2 *A7++,B1:B0 ; p @@ (k3:k2) inp[k-(i-3)]:inp[k-(i-2)]
STW .D2 B10,*++B2[1] ; o acl[i-1] = sum5
||[A1] LDDW .D1 *A0++,A9:A8 ; p @@ (kk1:kk0) inp[k+1]:inp[k]
||[A1] MV .S1X B6,A2 ; p cntr2 = N - 4 (B6 has held N - 4 since the prolog)
STW .D2T1 A6,*--B2[2] ; o acl[i-3] = sum3
||[A1] LDW .D1T2 *-A7(12),B7 ; p @@ (k1) inp[k-(i-1)] (or *-inp1[3] )
||[A1] MPYSP .M1X A8,B0,A5 ; p prod2 = inp[k]*inp[k-(i-2)]
||[A1] MPYSP .M2X A9,B1,B5 ; p prod6 = inp[k+1]*inp[k-(i-3)]
STW .D2T1 A6,*-B2[1] ; o acl[i-4] = sum4
||[A1] SUB .L1 A1,4,A1 ; o cntr1 = cntr1 - 4
||[A1] LDDW .D1T2 *A7,B9:B8 ; p @@ (k5:k4) inp[k-(i-5)]:inp[k-(i-4)]
||[A1] MPYSP .M1X A8,B7,A5 ; p prod1 = inp[k]*inp[k-(i-1)]
||[A1] MPYSP .M2X A9,B0,B5 ; p prod5 = inp[k+1]*inp[k-(i-2)]
||[A1] SHRU .S2 B6,1,B2 ; p (load counter) lcntr = (N - 4)/2
;**
; BRANCH TO OUTER LOOP OCCURS
; END OF BENCHMARK TIMING
;*------------------------------------------------------------------*
; Function epilog: return to the caller through B3. The LDW that restores B10
; and the NOP 4 occupy the branch's five delay slots.
B .S2 B3
LDW .D2 *++B15(4),B10 ; pop B10
NOP 4
; BRANCH TO CALLING FUNCTION OCCURS
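* (Web code-viewer residue, not part of the program) Keyboard shortcuts:
* Copy code: Ctrl + C
* Search code: Ctrl + F
* Full-screen mode: F11
* Toggle theme: Ctrl + Shift + D
* Show shortcuts: ?
* Increase font size: Ctrl + =
* Decrease font size: Ctrl + -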