📄 croscorf.asm
字号:
*===============================================================================
*
* TEXAS INSTRUMENTS, INC.
*
* CROSS-CORRELATION
*
* Revision Date: 3/13/98
*
* USAGE
*
* This routine is C Callable and can be called as:
*
* void crosscor(float *a, float *b, float *r, short nb, short nr)
*
* a = pointer to real input vector of size = nr+nb-1
* a typically contains input data (x) padded with
* consecutive nb - 1 zeros at the beginning and end.
* b = pointer to real input vector of size nb in forward order.
* b typically contains the filter coefs (h)
* r = pointer to real output vector of size nr
* nb= number of elements in vector b. NOTE: nb <= nr. nb is
* typically denoted m in convolution formulas. nb must be a
* MULTIPLE of 2
* nr= number of elements in vector r. nr must be a MULTIPLE of 4
*
* If routine is not to be used as a C callable function then
* you need to initialize values for all of the values passed
* as these are assumed to be in registers as defined by the
* calling convention of the compiler, (refer to the C compiler
* reference guide).
*
* ARGUMENTS PASSED -> REGISTER USED
* -------------------------------------
* a -> A4
* b -> B4
* r -> A6
* nb -> B6
* nr -> A8
*
* C CODE
*
* void crosscor(float *a, float *b, float *r, short nb, short nr)
* {
* short i, j;
* float accum;
*
* for (i = 0; i < nr ; i++)
* {
* accum = 0 ;
*
* for (j = 0; j < nb; j++)
* accum += b[j] * a[i+j];
*
* r[i] = accum ;
* }
* }
*
* This is the C equivalent of the assembly code. Note that
* the assembly code is hand optimized and restrictions may
* apply.
*
* DESCRIPTION
*
* This function calculates the full-length biased cross-correlation
* of real vectors a and b using time-domain techniques. The result
* is placed in real vector r.
*
* TECHNIQUES
*
* The inner loop is unrolled twice and software pipelined and the
* outer loop is unrolled 4 times.
*
* Input vector a is assumed to be padded with zeros which provides
* symmetry and allows unrolling of inner loop.
*
* This routine computes biased (raw) cross-correlation.
*
* Registers are shared by variables whenever possible to save on
* register usage.
*
* ASSUMPTIONS
*
* nb is a multiple of 2 and greater than or equal to 2 (2,4,6,...)
* nr is a multiple of 4 and greater than or equal to 4 (4,8,12,...)
*
* It is assumed that the input vector is padded with nb-1 consecutive
* zeros at the beginning and at the end.
*
* Arrays a,b and r are aligned on the same (odd/even) word boundary
*
* MEMORY NOTE
*
* No memory bank hits if array alignment restrictions hold
*
* CYCLES
*
* (nb/2)*nr + (nr/2)*5 + 8
*
*================================================================================
.global _crosscor
.text
*-------------------------------------------------------------------------------
* _crosscor
*
* Computes r[i] = sum over j of b[j] * a[i+j]. Each pass of the outer loop
* stores four consecutive outputs (four STWs through B0); each pass of the
* inner-loop kernel loads one bb1:bb0 pair, i.e. consumes two elements of b.
*
* Register roles, as named by the per-line comments below:
*   A4    = a       base of the current group of four outputs; advanced by
*                   two "ADD A4,8,A4" per outer pass
*   A3    = a_save  running read pointer into a; reloaded from A4
*   A6    = b_save  running read pointer into b; reloaded from B4 (= b)
*   B0    = r1      output write pointer (copy of the incoming r)
*   A2    = ocntr   outer-loop counter, starts at nr - 4, decremented by 4
*   B1    = lcntr   predicate on the look-ahead loads, starts at nb - 4,
*                   decremented by 2 per kernel pass while non-zero
*   B2    = icntr   inner-loop branch counter, starts at nb - 4, decremented
*                   by 2 per kernel pass while non-zero
*   A1:A0 = aa1:aa0, A9:A8 = aa3:aa2, A10 = aa4   (elements of a)
*   B9:B8 = bb1:bb0                               (elements of b)
*   A7, B7 = products;  A5, B5 = accumulators
*
* The original comments label the four ADDSPs per side in the kernel as
* acc1..acc4 (all in A5) and acc5..acc8 (all in B5).
* NOTE(review): four independent running sums appear to share one register,
* kept apart by ADDSP result latency -- confirm against the C67x pipeline
* documentation before changing the instruction order.
*
* Comment tags: presumably "p" = pipeline prolog, "o" = outer-loop code,
* "f" = function setup, and the number of "@" = how many iterations ahead the
* instruction belongs to -- TODO confirm.
*
* All timing notes below assume the packet order is left exactly as written.
*-------------------------------------------------------------------------------
_crosscor:
STW .D2T1 A10,*B15--(4) ; save A10 on the stack (popped again at exit)
* BEGIN BENCHMARK TIMING
* Both MVs below are in the same execute packet as the first load: B0 takes
* the incoming r from A6 while A6 is replaced by b.
LDDW .D1 *A4++,A1:A0 ; p @ aa1:aa0 = *a++
|| MV .L1X B4,A6 ; f b_save = b
|| MV .L2X A6,B0 ; f r1 = r
LDDW .D1T2 *A6++,B9:B8 ; p @ bb1:bb0 = *b_save++
|| MV .L1 A4,A3 ; p a_save = a
LDDW .D1 *A3,A9:A8 ; p @ aa3:aa2 = *a_save
LDW .D1 *+A3[2],A10 ; p @ aa4 = *+a_save[2]
LDDW .D1 *A3++,A1:A0 ; p @@ aa1:aa0 = *a_save++
LDDW .D1T2 *A6++,B9:B8 ; p @@ bb1:bb0 = *b_save++
* NOTE(review): the SUB in this packet reads A8 as nr although an LDDW into
* A9:A8 was issued four packets earlier; this appears to rely on that load not
* having written A8 yet -- confirm against C67x load delay slots before moving
* either instruction.
LDDW .D1 *A3,A9:A8 ; p @@ aa3:aa2 = *a_save
|| MPYSP .M1X A0,B8,A7 ; p prod1 = aa0 * bb0
|| MPYSP .M2X A1,B9,B7 ; p prod5 = aa1 * bb1
|| ADD .L1 A4,8,A4 ; p a = a + 8
|| SUB .S1 A8,4,A2 ; f ocntr = nr - 4
LDW .D1 *+A3[2],A10 ; p @@ aa4 = *+a_save[2]
|| MPYSP .M1X A1,B8,A7 ; p prod2 = aa1 * bb0
|| MPYSP .M2X A8,B9,B7 ; p prod6 = aa2 * bb1
|| SUB .L2 B6,4,B1 ; f lcntr = nb - 4
;**
* Outer loop entry: clear both accumulators, issue the branch into the kernel
* and initialise the inner-loop counter. The loads stay predicated on B1.
oloop:
[B1] LDDW .D1 *A3++,A1:A0 ; p @@@ aa1:aa0 = *a_save++
|| MPYSP .M1X A8,B8,A7 ; p prod3 = aa2 * bb0
|| MPYSP .M2X A9,B9,B7 ; p prod7 = aa3 * bb1
|| B .S2 iloop ; p Branch to inner loop
|| ZERO .L1 A5 ; p acc1=acc2=acc3=acc4 = 0
|| ZERO .L2 B5 ; p acc5=acc6=acc7=acc8 = 0
[B1] LDDW .D1T2 *A6++,B9:B8 ; p @@@ bb1:bb0 = *b_save++
|| MPYSP .M1X A9,B8,A7 ; p prod4 = aa3 * bb0
|| MPYSP .M2X A10,B9,B7 ; p prod8 = aa4 * bb1
|| SUB .L2 B6,4,B2 ; p icntr = nb - 4
; Kernel Loop Begins
* Four execute packets per pass. Each packet issues one A-side and one B-side
* MPYSP/ADDSP pair plus one look-ahead load guarded by B1.
iloop:
[B1] LDDW .D1 *A3,A9:A8 ; @@@ aa3:aa2 = *a_save
|| MPYSP .M1X A0,B8,A7 ; prod1 = aa0 * bb0
|| MPYSP .M2X A1,B9,B7 ; prod5 = aa1 * bb1
|| ADDSP .L1 A5,A7,A5 ; acc1 = acc1 + prod1
|| ADDSP .L2 B5,B7,B5 ; acc5 = acc5 + prod5
[B1] LDW .D1 *+A3[2],A10 ; @@@ aa4 = *+a_save[2]
|| MPYSP .M1X A1,B8,A7 ; prod2 = aa1 * bb0
|| MPYSP .M2X A8,B9,B7 ; prod6 = aa2 * bb1
|| ADDSP .L1 A5,A7,A5 ; acc2 = acc2 + prod2
|| ADDSP .L2 B5,B7,B5 ; acc6 = acc6 + prod6
||[B1] SUB .S2 B1,2,B1 ; lcntr = lcntr - 2
[B1] LDDW .D1 *A3++,A1:A0 ; @@@@ aa1:aa0 = *a_save++
|| MPYSP .M1X A8,B8,A7 ; prod3 = aa2 * bb0
|| MPYSP .M2X A9,B9,B7 ; prod7 = aa3 * bb1
|| ADDSP .L1 A5,A7,A5 ; acc3 = acc3 + prod3
|| ADDSP .L2 B5,B7,B5 ; acc7 = acc7 + prod7
||[B2] B .S1 iloop ; branch to inner loop
||[B2] SUB .S2 B2,2,B2 ; icntr = icntr - 2
* Once icntr (B2) is zero, a_save is reset to a ready for the next outer pass.
[B1] LDDW .D1T2 *A6++,B9:B8 ; @@@@ bb1:bb0 = *b_save++
|| MPYSP .M1X A9,B8,A7 ; prod4 = aa3 * bb0
|| MPYSP .M2X A10,B9,B7 ; prod8 = aa4 * bb1
|| ADDSP .L1 A5,A7,A5 ; acc4 = acc4 + prod4
|| ADDSP .L2 B5,B7,B5 ; acc8 = acc8 + prod8
||[!B2] MV .S1 A4,A3 ; a_save = a
; Kernel Loop Ends
* Outer-loop tail: add the A-side and B-side accumulators pairwise, then store
* the four results through B0. Everything predicated on A2 (ocntr != 0) restarts
* the pointers, counters, loads and first multiplies for the next four outputs.
* The two "ADD A4,8,A4" are unpredicated, so a advances on the last pass too.
ADDSP .L2X A5,B5,B5 ; o acc5 = acc1 + acc5
||[A2] LDDW .D1 *A3++,A1:A0 ; p @ aa1:aa0 = *a_save++
||[A2] MV .S1X B4,A6 ; p b_save = b
ADDSP .L1X A5,B5,A5 ; o acc2 = acc2 + acc6
||[A2] LDDW .D1T2 *A6++,B9:B8 ; p @ bb1:bb0 = *b_save++
||[A2] SUB .S2 B6,4,B1 ; p lcntr = nb - 4
ADDSP .L2X A5,B5,B5 ; o acc7 = acc3 + acc7
||[A2] B .S2 oloop ; branch to outer loop
||[A2] LDDW .D1 *A3,A9:A8 ; p @ aa3:aa2 = *a_save
ADDSP .L2X A5,B5,B5 ; o acc8 = acc4 + acc8
||[A2] LDW .D1 *+A3[2],A10 ; p @ aa4 = *+a_save[2]
|| ADD .S1 A4,8,A4 ; p a = a + 8
STW .D2 B5,*B0++ ; o *r1++ = acc5
||[A2] LDDW .D1 *A3++,A1:A0 ; p @@ aa1:aa0 = *a_save++
|| ADD .S1 A4,8,A4 ; p a = a + 8
STW .D2T1 A5,*B0++ ; o *r1++ = acc2
||[A2] LDDW .D1T2 *A6++,B9:B8 ; p @@ bb1:bb0 = *b_save++
STW .D2 B5,*B0++ ; o *r1++ = acc7
||[A2] LDDW .D1 *A3,A9:A8 ; p @@ aa3:aa2 = *a_save
||[A2] MPYSP .M1X A0,B8,A7 ; p prod1 = aa0 * bb0
||[A2] MPYSP .M2X A1,B9,B7 ; p prod5 = aa1 * bb1
STW .D2 B5,*B0++ ; o *r1++ = acc8
||[A2] SUB .L1 A2,4,A2 ; o ocntr = ocntr - 4
||[A2] LDW .D1 *+A3[2],A10 ; p @@ aa4 = *+a_save[2]
||[A2] MPYSP .M1X A1,B8,A7 ; p prod2 = aa1 * bb0
||[A2] MPYSP .M2X A8,B9,B7 ; p prod6 = aa2 * bb1
; Branch to outer loop occurs here
* END BENCHMARK TIMING
* Exit: branch through B3 (presumably the caller's return address under the C
* calling convention -- confirm) and restore A10 from the stack. The NOP 5
* presumably fills the branch delay slots; the A10 load completes within them.
B .S2 B3
|| LDW .D2T1 *++B15(4),A10 ; Pop A10
NOP 5
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -