📄 convolf.asm

📁 TMS320bbs（源程序）的c67xfiles文件。用于在CCS2.0集成编译环境下实现TI的c67x系列DSP开发。是用DSP汇编语言
💻 ASM
字号:
*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.		
*
*	CONVOLUTION
*
*	Revision Date: 3/13/98
*	
*	USAGE
*
*		This routine is C Callable and can be called as:
*
*
*		void convol(float *a, float *b, float *r, short nb, short nr)
*
*		a =	pointer to real input vector of size = nr+nb-1
*			a typically contains input data (x) padded with 
*			consecutive nb - 1  zeros at the beginning and end.
*		b =	pointer to real input vector of size nb in forward order. 
*			b typically contains the filter coefs (h)
*		r =	pointer to real output vector of size nr
*		nb=	number of elements in vector b. NOTE: nb <= nr. nb is 
*			typically denoted as m in convolution formulas. nb must
*			be a MULTIPLE of 2 and greater than or equal to 4.
*		nr=	number of elements in vector r. nr must be a MULTIPLE of 4
*
*		If routine is not to be used as a C callable function then
*		you need to initialize values for all of the values passed
*		as these are assumed to be in registers as defined by the 
*		calling convention of the compiler, (refer to the C compiler
*		reference guide).
*
*		ARGUMENTS PASSED   ->   REGISTER USED
*		-------------------------------------
*		a                  ->   A4
*		b                  ->   B4
*		r                  ->   A6
*		nb                 ->   B6
*		nr                 ->   A8
*
*	C CODE
*
*		void convol(float *a, float *b, float *r, short nb, short nr)
*		{
*			short	ocntr, icntr;
*			float	acc ;
*
*			for (ocntr = nr ; ocntr > 0 ; ocntr--)
*			{
*				acc = 0 ;	/* zero the accumulator */
*
*				for (icntr = nb ; icntr > 0 ; icntr--)
*				{
*					acc += a[nr-ocntr+nb-icntr]*b[(icntr-1)] ;
*				}
*				r[nr-ocntr] = acc; /* Store r[0] thru r[nr-1] */
*			}
*
*		}
*
*		This is the C equivalent of the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*	DESCRIPTION
*
*		This function calculates the full-length convolution of real 
*		vectors a and b using time-domain techniques. The result is 
*		placed in real vector r.
*
*		It is assumed that the input vector a is padded with nb-1 
*		zeros at the beginning and at the end.
*
*		It is assumed that the length of the input vector b, nb, is a
*		multiple of 2 and the length of the output vector r, nr, is a	
*		multiple of 4.  The assembly routine computes 4 output samples
*		at a time. 
*		
*	TECHNIQUES
*
*		The inner loop is unrolled twice and the outer loop is 
*		unrolled four times.
*
*	ASSUMPTIONS
*
*		nb is a multiple of 2 and greater than or equal to 4
*		nr is a multiple of 4
*		Arrays a, b and r are aligned on even double-word boundary
*		
*	MEMORY NOTE
*
*		There will be a total of nr/4 memory bank hits (or once every pass
*		through the outer loop) IF nb is an integral multiple of 4 
*		(4,8,12 ...) with arrays a, b and r being aligned on even 
*		double-word boundary
*
*		There will NOT be any memory bank hits if nb IS NOT an integral 
*		multiple of 4 (6,10,14 ...) with arrays a, b and r being aligned 
*		on even double-word boundary
*
*	CYCLES
*
*		(nb/2)*nr + (nr/2)*5 + 8
*
*===============================================================================


		.global	_convol
		.text
_convol:

	STW	.D2T1		A10,*B15--(4)	; f Push A10

* BEGIN BENCHMARK TIMING

	LDDW	.D1		*A4++,A1:A0	; p @ aa1:aa0 = *a++
||	MV	.L1X		B4,A6		; f b_save = b
||	MV	.L2X		A6,B0		; f r1 = r
||	ADD	.S2		-2,B6,B1	; f lcntr = nb - 2

	LDDW	.D2		*+B4[B1],B9:B8	; p @ bb1:bb0 = *+b[lcntr]
||	MV	.L1		A4,A3		; p a_save = a

	LDDW	.D1		*A3,A9:A8	; p @ aa3:aa2 = *a_save
||	ADDAW	.D2		B4,B1,B4	; p b = b + lcntr * 4

	LDW	.D1		*+A3[2],A10	; p @ aa4 = *+a_save[2]
||	SUB	.L2		B1,2,B1		; p lcntr = lcntr - 2
||	SUB	.L1X		B4,8,A6		; p b_save = b - 8

	LDDW	.D1		*A3++,A1:A0	; p @@ aa1:aa0 = *a_save++

	LDDW	.D1T2		*A6--,B9:B8	; p @@ bb1:bb0 = *b_save--

	LDDW	.D1		*A3,A9:A8	; p @@ aa3:aa2 = *a_save
||	MPYSP	.M1X		A0,B9,A7	; p prod1 = aa0 * bb1
||	MPYSP	.M2X		A1,B8,B7	; p prod5 = aa1 * bb0
||	ADD	.L1		A4,8,A4		; p a = a + 8
||	SUB	.S1		A8,4,A2		; p ocntr = nr - 4

	LDW	.D1		*+A3[2],A10	; p @@ aa4 = *+a_save[2]
||	MPYSP	.M1X		A1,B9,A7	; p prod2 = aa1 * bb1
||	MPYSP	.M2X		A8,B8,B7	; p prod6 = aa2 * bb0

;**	
oloop:

  [B1]	LDDW	.D1		*A3++,A1:A0	; p @@@ aa1:aa0 = *a_save++
||	MPYSP	.M1X		A8,B9,A7	; p prod3 = aa2 * bb1
||	MPYSP	.M2X		A9,B8,B7	; p prod7 = aa3 * bb0
||	B	.S2		iloop		; Branch to inner loop
||	ZERO	.L1		A5		; p acc1=acc2=acc3=acc4 = 0
||	ZERO	.L2		B5		; p acc5=acc6=acc7=acc8 = 0

  [B1]	LDDW	.D1T2		*A6--,B9:B8	; p @@@ bb1:bb0 = *b_save--
||	MPYSP	.M1X		A9,B9,A7	; p prod4 = aa3 * bb1
||	MPYSP	.M2X		A10,B8,B7	; p prod8 = aa4 * bb0
||	SUB	.L2		B6,4,B2		; p icntr = nb - 4

; Kernel Loop Begins

iloop:
  [B1]	LDDW	.D1		*A3,A9:A8	; p @@@ aa3:aa2 = *a_save
||	MPYSP	.M1X		A0,B9,A7	; prod1 = aa0 * bb1
||	MPYSP	.M2X		A1,B8,B7	; prod5 = aa1 * bb0
||	ADDSP	.L1		A5,A7,A5	; acc1 = acc1 + prod1
||	ADDSP	.L2		B5,B7,B5	; acc5 = acc5 + prod5

  [B1]	LDW	.D1		*+A3[2],A10	; p @@@ aa4 = *+a_save[2]
||	MPYSP	.M1X		A1,B9,A7	; prod2 = aa1 * bb1
||	MPYSP	.M2X		A8,B8,B7	; prod6 = aa2 * bb0
||	ADDSP	.L1		A5,A7,A5	; acc2 = acc2 + prod2
||	ADDSP	.L2		B5,B7,B5	; acc6 = acc6 + prod6
||[B1]	SUB	.S2		B1,2,B1		; lcntr = lcntr - 2


  [B1]	LDDW	.D1		*A3++,A1:A0	; p @@@@ aa1:aa0 = *a_save++
||	MPYSP	.M1X		A8,B9,A7	; prod3 = aa2 * bb1
||	MPYSP	.M2X		A9,B8,B7	; prod7 = aa3 * bb0
||	ADDSP	.L1		A5,A7,A5	; acc3 = acc3 + prod3
||	ADDSP	.L2		B5,B7,B5	; acc7 = acc7 + prod7
||[B2]	B	.S2		iloop		; branch to inner loop
||[B2]	SUB	.D2		B2,2,B2		; icntr = icntr - 2

  [B1]	LDDW	.D1T2		*A6--,B9:B8	; p @@@@ bb1:bb0 = *b_save--
||	MPYSP	.M1X		A9,B9,A7	; prod4 = aa3 * bb1
||	MPYSP	.M2X		A10,B8,B7	; prod8 = aa4 * bb0
||	ADDSP	.L1		A5,A7,A5	; acc4 = acc4 + prod4 
||	ADDSP	.L2		B5,B7,B5	; acc8 = acc8 + prod8
||[!B2]	MV	.S1		A4,A3		; a_save = a

; Kernel Loop Ends 

	ADDSP	.L2X		A5,B5,B5	; o acc5 = acc1 + acc5
||[A2]	LDDW	.D1		*A3++,A1:A0	; p @ aa1:aa0 = *a_save++
||[A2]	MV	.S1X		B4,A6		; p b_save = b

	ADDSP	.L1X		A5,B5,A5	; o acc2 = acc2 + acc6
||[A2]	LDDW	.D1T2		*A6--,B9:B8	; p @ bb1:bb0 = *b_save--
||[A2]	SUB	.S2		B6,4,B1		; p lcntr = nb - 4

	ADDSP	.L2X		A5,B5,B5	; o acc7 = acc3 + acc7
||[A2]	B	.S2		oloop		; branch to outer loop
||[A2]	LDDW	.D1		*A3,A9:A8	; p @ aa3:aa2 = *a_save

	ADDSP	.L2X		A5,B5,B5	; o acc8 = acc4 + acc8
||[A2]	LDW	.D1		*+A3[2],A10	; p @ aa4 = *+a_save[2]
||	ADD	.S1		A4,8,A4		; p a = a + 8

	STW	.D2		B5,*B0++	; o *r1++ = acc5
||[A2]	LDDW	.D1		*A3++,A1:A0	; p @@ aa1:aa0 = *a_save++
||	ADD	.S1		A4,8,A4		; p a = a + 8

	STW	.D2T1		A5,*B0++	; o *r1++ = acc2
||[A2]	LDDW	.D1T2		*A6--,B9:B8	; p @@ bb1:bb0 = *b_save--

	STW	.D2		B5,*B0++	; o *r1++ = acc7
||[A2]	LDDW	.D1		*A3,A9:A8	; p @@ aa3:aa2 = *a_save
||[A2]	MPYSP	.M1X		A0,B9,A7	; p prod1 = aa0 * bb1
||[A2]	MPYSP	.M2X		A1,B8,B7	; p prod5 = aa1 * bb0

	STW	.D2		B5,*B0++	; o *r1++ = acc8
||[A2]	SUB	.L1		A2,4,A2		; o ocntr = ocntr - 4
||[A2]	LDW	.D1		*+A3[2],A10	; p @@ aa4 = *+a_save[2]
||[A2]	MPYSP	.M1X		A1,B9,A7	; p prod2 = aa1 * bb1
||[A2]	MPYSP	.M2X		A8,B8,B7	; p prod6 = aa2 * bb0

; Branch to outer loop occures here 
* END BENCHMARK TIMING
	B       .S2     	B3
||	LDW	.D2T1		*++B15(4),A10	; Pop A10
	NOP			5
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -