📄 convolf.asm
字号:
*===============================================================================
*
* TEXAS INSTRUMENTS, INC.
*
* CONVOLUTION
*
* Revision Date: 3/13/98
*
* USAGE
*
* This routine is C Callable and can be called as:
*
*
* void convol(float *a, float *b, float *r, short nb, short nr)
*
* a = pointer to real input vector of size = nr+nb-1
* a typically contains input data (x) padded with
* consecutive nb - 1 zeros at the beginning and end.
* b = pointer to real input vector of size nb in forward order.
* b typically contains the filter coefs (h)
* r = pointer to real output vector of size nr
* nb= number of elements in vector b. NOTE: nb <= nr. nb is
* typically denoted m in convolution formulas. nb must be a
* MULTIPLE of 2
* nr= number of elements in vector r. nr must be a MULTIPLE of 4
*
* If routine is not to be used as a C callable function then
* you need to initialize values for all of the values passed
* as these are assumed to be in registers as defined by the
* calling convention of the compiler, (refer to the C compiler
* reference guide).
*
* ARGUMENTS PASSED -> REGISTER USED
* -------------------------------------
* a -> A4
* b -> B4
* r -> A6
* nb -> B6
* nr -> A8
*
* C CODE
*
* void convol(float *a, float *b, float *r, short nb, short nr)
* {
* short ocntr, icntr;
* float acc ;
*
* for (ocntr = nr ; ocntr > 0 ; ocntr--)
* {
* acc = 0 ; /* zero the accumulator */
*
* for (icntr = nb ; icntr > 0 ; icntr--)
* {
* acc += a[nr-ocntr+nb-icntr]*b[(icntr-1)] ;
* }
* r[nr-ocntr] = acc; /* Store r[0] thru r[nr-1] */
* }
*
* }
*
* This is the C equivalent of the assembly code. Note that
* the assembly code is hand optimized and restrictions may
* apply.
*
* DESCRIPTION
*
* This function calculates the full-length convolution of real
* vectors a and b using time-domain techniques. The result is
* placed in real vector r.
*
* It is assumed that the input vector a is padded with nb-1
* zeros at the beginning and end.
*
* It is assumed that the length of the input vector b, nb, is a
* multiple of 2 and the length of the output vector r, nr, is a
* multiple of 4. The assembly routine computes 4 output samples
* at a time.
*
* TECHNIQUES
*
* The inner loop is unrolled twice and the outer loop is
* unrolled four times.
*
* ASSUMPTIONS
*
* nb is a multiple of 2 and greater than or equal to 4
* nr is a multiple of 4
* Arrays a, b and r are aligned on even double-word boundary
*
* MEMORY NOTE
*
* There will be a total of nr/4 memory bank hits (or once every pass
* through the outer loop) IF nb is an integral multiple of 4
* (4,8,12 ...) with arrays a, b and r being aligned on even
* double-word boundary
*
* There will NOT be any memory bank hits if nb IS NOT an integral
* multiple of 4 (6,10,14 ...) with arrays a, b and r being aligned
* on even double-word boundary
*
* CYCLES
*
* (nb/2)*nr + (nr/2)*5 + 8
*
*===============================================================================
; -----------------------------------------------------------------------------
; _convol -- single-precision convolution, four outputs per outer-loop pass.
;
; Entry registers (as listed in the file header): A4 = a, B4 = b, A6 = r,
; B6 = nb, A8 = nr. Returns through B3. A10 is pushed through B15 on entry
; and popped on exit; no other stack traffic is visible in this routine.
;
; Register roles inside the routine (taken from the per-line comments):
;   A4      a       base of the input window for the current group of outputs
;   A3      a_save  load pointer that walks forward through a (*A3++)
;   B4      b       set during setup to b + lcntr*4
;   A6      b_save  load pointer that walks backward through b (*A6--)
;   B0      r1      output store pointer
;   B1      lcntr   counter/predicate gating the kernel's loads
;   B2      icntr   inner-loop branch counter
;   A2      ocntr   outer-loop counter; also predicates the next-group prolog
;   A1:A0, A9:A8, A10   aa1:aa0, aa3:aa2, aa4   (input samples)
;   B9:B8               bb1:bb0                 (coefficients)
;   A7, B7  products;   A5, B5  accumulators
;
; Comment tags are kept from the original. They look like f = function setup,
; p = prolog work for a later iteration, o = outer-loop work, and the @ count
; marks which pipelined iteration a load belongs to.
; NOTE(review): the tag meanings are inferred; this file does not define them.
; -----------------------------------------------------------------------------
.global _convol
.text
_convol:
STW .D2T1 A10,*B15--(4) ; f Push A10 (restored by the LDW before return)
* BEGIN BENCHMARK TIMING
; Setup: r is copied from A6 to B0 in the same execute packet that overwrites
; A6 with b, and lcntr starts at nb - 2.
LDDW .D1 *A4++,A1:A0 ; p @ aa1:aa0 = *a++
|| MV .L1X B4,A6 ; f b_save = b
|| MV .L2X A6,B0 ; f r1 = r
|| ADD .S2 -2,B6,B1 ; f lcntr = nb - 2
; NOTE(review): the ADDAW in the next packet treats lcntr as a word index
; (b + lcntr*4). Confirm that the bracketed register offset on this LDDW
; resolves to that same address on the target (check LDDW offset scaling).
LDDW .D2 *+B4[B1],B9:B8 ; p @ bb1:bb0 = *+b[lcntr]
|| MV .L1 A4,A3 ; p a_save = a
LDDW .D1 *A3,A9:A8 ; p @ aa3:aa2 = *a_save
|| ADDAW .D2 B4,B1,B4 ; p b = b + lcntr * 4
LDW .D1 *+A3[2],A10 ; p @ aa4 = *+a_save[2]
|| SUB .L2 B1,2,B1 ; p lcntr = lcntr - 2
|| SUB .L1X B4,8,A6 ; p b_save = b - 8
LDDW .D1 *A3++,A1:A0 ; p @@ aa1:aa0 = *a_save++
LDDW .D1T2 *A6--,B9:B8 ; p @@ bb1:bb0 = *b_save--
; ocntr is derived from nr in A8 in this packet.
; NOTE(review): A9:A8 is also the destination of the LDDW issued four packets
; earlier, so this read presumably relies on that load not having landed yet.
; Confirm against the load delay-slot count before moving any packet here.
LDDW .D1 *A3,A9:A8 ; p @@ aa3:aa2 = *a_save
|| MPYSP .M1X A0,B9,A7 ; p prod1 = aa0 * bb1
|| MPYSP .M2X A1,B8,B7 ; p prod5 = aa1 * bb0
|| ADD .L1 A4,8,A4 ; p a = a + 8
|| SUB .S1 A8,4,A2 ; p ocntr = nr - 4
LDW .D1 *+A3[2],A10 ; p @@ aa4 = *+a_save[2]
|| MPYSP .M1X A1,B9,A7 ; p prod2 = aa1 * bb1
|| MPYSP .M2X A8,B8,B7 ; p prod6 = aa2 * bb0
;**
oloop:
; Outer-loop head (two packets): clear A5 and B5, issue the first branch to
; iloop ahead of the kernel, finish the prolog multiplies (prod3/7, prod4/8)
; and set icntr = nb - 4. Loads here are skipped when lcntr (B1) is zero.
[B1] LDDW .D1 *A3++,A1:A0 ; p @@@ aa1:aa0 = *a_save++
|| MPYSP .M1X A8,B9,A7 ; p prod3 = aa2 * bb1
|| MPYSP .M2X A9,B8,B7 ; p prod7 = aa3 * bb0
|| B .S2 iloop ; Branch to inner loop
|| ZERO .L1 A5 ; p acc1=acc2=acc3=acc4 = 0
|| ZERO .L2 B5 ; p acc5=acc6=acc7=acc8 = 0
[B1] LDDW .D1T2 *A6--,B9:B8 ; p @@@ bb1:bb0 = *b_save--
|| MPYSP .M1X A9,B9,A7 ; p prod4 = aa3 * bb1
|| MPYSP .M2X A10,B8,B7 ; p prod8 = aa4 * bb0
|| SUB .L2 B6,4,B2 ; p icntr = nb - 4
; Kernel Loop Begins
iloop:
; Kernel: four execute packets. Each packet issues one MPYSP into A7 and one
; into B7, plus one ADDSP into A5 and one into B5. The comments label the A5
; sums acc1..acc4 and the B5 sums acc5..acc8, so each register carries four
; running sums (presumably kept apart by ADDSP latency -- confirm).
; Loads are predicated on lcntr (B1) and the loop branch on icntr (B2).
; Once icntr is zero, a_save is reloaded from a for the next output group.
[B1] LDDW .D1 *A3,A9:A8 ; p @@@ aa3:aa2 = *a_save
|| MPYSP .M1X A0,B9,A7 ; prod1 = aa0 * bb1
|| MPYSP .M2X A1,B8,B7 ; prod5 = aa1 * bb0
|| ADDSP .L1 A5,A7,A5 ; acc1 = acc1 + prod1
|| ADDSP .L2 B5,B7,B5 ; acc5 = acc5 + prod5
[B1] LDW .D1 *+A3[2],A10 ; p @@@ aa4 = *+a_save[2]
|| MPYSP .M1X A1,B9,A7 ; prod2 = aa1 * bb1
|| MPYSP .M2X A8,B8,B7 ; prod6 = aa2 * bb0
|| ADDSP .L1 A5,A7,A5 ; acc2 = acc2 + prod2
|| ADDSP .L2 B5,B7,B5 ; acc6 = acc6 + prod6
||[B1] SUB .S2 B1,2,B1 ; lcntr = lcntr - 2
[B1] LDDW .D1 *A3++,A1:A0 ; p @@@@ aa1:aa0 = *a_save++
|| MPYSP .M1X A8,B9,A7 ; prod3 = aa2 * bb1
|| MPYSP .M2X A9,B8,B7 ; prod7 = aa3 * bb0
|| ADDSP .L1 A5,A7,A5 ; acc3 = acc3 + prod3
|| ADDSP .L2 B5,B7,B5 ; acc7 = acc7 + prod7
||[B2] B .S2 iloop ; branch to inner loop
||[B2] SUB .D2 B2,2,B2 ; icntr = icntr - 2
[B1] LDDW .D1T2 *A6--,B9:B8 ; p @@@@ bb1:bb0 = *b_save--
|| MPYSP .M1X A9,B9,A7 ; prod4 = aa3 * bb1
|| MPYSP .M2X A10,B8,B7 ; prod8 = aa4 * bb0
|| ADDSP .L1 A5,A7,A5 ; acc4 = acc4 + prod4
|| ADDSP .L2 B5,B7,B5 ; acc8 = acc8 + prod8
||[!B2] MV .S1 A4,A3 ; a_save = a
; Kernel Loop Ends
; Drain: four ADDSPs combine the A-side and B-side sums into the four results.
; The second one targets A5, so the second STW below stores from A5; the other
; three results are stored from B5. While ocntr (A2) is nonzero the same
; packets also carry the prolog for the next group: b_save is reset from b,
; lcntr is reset to nb - 4, and the first loads and multiplies are issued.
; a is advanced by 8 twice (16 in total) per outer pass, unconditionally.
ADDSP .L2X A5,B5,B5 ; o acc5 = acc1 + acc5
||[A2] LDDW .D1 *A3++,A1:A0 ; p @ aa1:aa0 = *a_save++
||[A2] MV .S1X B4,A6 ; p b_save = b
ADDSP .L1X A5,B5,A5 ; o acc2 = acc2 + acc6
||[A2] LDDW .D1T2 *A6--,B9:B8 ; p @ bb1:bb0 = *b_save--
||[A2] SUB .S2 B6,4,B1 ; p lcntr = nb - 4
ADDSP .L2X A5,B5,B5 ; o acc7 = acc3 + acc7
||[A2] B .S2 oloop ; branch to outer loop
||[A2] LDDW .D1 *A3,A9:A8 ; p @ aa3:aa2 = *a_save
ADDSP .L2X A5,B5,B5 ; o acc8 = acc4 + acc8
||[A2] LDW .D1 *+A3[2],A10 ; p @ aa4 = *+a_save[2]
|| ADD .S1 A4,8,A4 ; p a = a + 8
STW .D2 B5,*B0++ ; o *r1++ = acc5
||[A2] LDDW .D1 *A3++,A1:A0 ; p @@ aa1:aa0 = *a_save++
|| ADD .S1 A4,8,A4 ; p a = a + 8
STW .D2T1 A5,*B0++ ; o *r1++ = acc2
||[A2] LDDW .D1T2 *A6--,B9:B8 ; p @@ bb1:bb0 = *b_save--
STW .D2 B5,*B0++ ; o *r1++ = acc7
||[A2] LDDW .D1 *A3,A9:A8 ; p @@ aa3:aa2 = *a_save
||[A2] MPYSP .M1X A0,B9,A7 ; p prod1 = aa0 * bb1
||[A2] MPYSP .M2X A1,B8,B7 ; p prod5 = aa1 * bb0
STW .D2 B5,*B0++ ; o *r1++ = acc8
||[A2] SUB .L1 A2,4,A2 ; o ocntr = ocntr - 4
||[A2] LDW .D1 *+A3[2],A10 ; p @@ aa4 = *+a_save[2]
||[A2] MPYSP .M1X A1,B9,A7 ; p prod2 = aa1 * bb1
||[A2] MPYSP .M2X A8,B8,B7 ; p prod6 = aa2 * bb0
; Branch to outer loop occurs here
* END BENCHMARK TIMING
; Return: branch through B3 with the A10 pop issued in the same packet.
; NOP 5 pads the cycles that follow the branch.
B .S2 B3
|| LDW .D2T1 *++B15(4),A10 ; Pop A10
NOP 5
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -