📄 sumsq.asm

📁 TMS320bbs（源程序）的c67xfiles文件。用于在CCS2.0集成编译环境下实现TI的c67x系列DSP开发。是用DSP汇编语言
💻 ASM
字号:
*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.
*
*	VECTOR SUM OF SQUARES  (floating point)
*
*	Revision Date:	02/18/98
*
*	USAGE	This routine is C callable and can be called as:
*
*		float vecsumsq(float *x, short count);
*
*		x     - pointer to the array holding the floating point vector
*		count - number of values in the x vector
*
*		If the routine is not used as a C callable function, the
*		caller must initialize all of the parameters itself, because
*		they are assumed to already be in the registers defined by
*		the calling convention of the compiler (refer to the
*		TMS320C6x Optimizing C Compiler User's Guide).
*
*		NOTE(review): the prototype above declares count as short,
*		while the C equivalent below declares it as int - confirm
*		which declaration callers actually use.
*
*	C CODE
*		This is the C equivalent of the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*		float vecsumsq(float x[], int n)
*		{
*			int i;
*			float sum=0;
*			for(i=0; i<n; i++)
*			{
*				sum += x[i]*x[i];
*			}
*			return(sum);
*		}
*
*	DESCRIPTION
*
*		This routine calculates the sum of squares of a vector.
*
*		Unlike the C loop, the assembly keeps two running sums:
*		sum1 (A5) accumulates the squares loaded through A4 and
*		sum2 (B5) the squares loaded through B8; they are combined
*		only after the loop.
*		NOTE(review): because the additions happen in a different
*		order than in the C loop, the result may differ from the C
*		version in the last bits - confirm if bit-exactness matters.
*
*	TECHNIQUES
*
*		1.  Two LDW instructions are used to simultaneously load
*		    x[i] and x[i+1].
*		2.  The loop is unrolled once and software pipelined.
*		3.  The loop is primed to reduce code size and ease the
*		    array size restriction.
*
*	ASSUMPTIONS
*
*		1.  Little Endian is assumed for LDW instructions.
*		2.  The value of count (and the number of entries in the
*		    array x) must be greater than or equal to 12 and
*		    even (i.e. 12, 14, 16, ...).
*		3.  Since single assignment of registers is not used,
*		    interrupts should be disabled before this function is
*		    called.
*
*	MEMORY NOTE
*
*		This routine performs extraneous loads since it has no
*		epilog.  The epilog can be added by changing the first
*		instruction as described below (MVK 22,B0 instead of
*		MVK 4,B0) and by removing the (*) from the epilog lines.
*		NOTE(review): with the epilog disabled, confirm how far past
*		the end of x the extra LDWs read and that this memory is
*		readable on the target.
*
*	ARGUMENTS PASSED
*
*		x	 ->  A4
*		count	 ->  B4
*
*	REGISTER USAGE (as used by the code below)
*
*		A4	pointer to x[i]; also receives the return value
*		B8	pointer to x[i+1]
*		A7, B7	most recently loaded x[i], x[i+1]
*		A0, B0	prod1, prod2 (B0 first holds the count adjustment,
*			and A0/B0 are reused in the final reduction)
*		A5, B5	sum1, sum2
*		B2	loop counter (cntr)
*		B3	return address (target of the final branch)
*
*	CYCLES
*
*		N/2 + 25
*
*===============================================================================
	.global _vecsumsq
	.text
_vecsumsq:
*** BEGIN Benchmark Timing ***
;	MVK	.S2	22,B0		; B0 = 22 (if Epilog used) - alternative first instruction, disabled
	MVK	.S2	4,B0		; B0 = 4 (if Epilog NOT used); subtracted from count below
||	ADD	.L2X	A4,4,B8		; B8 = A4 + 4, the pointer to x[i+1]
					;      A4 is the pointer to x[i]
;** --------------------------------------------------------------------------*
; PIPED LOOP PROLOG (primed)
; Each of the five execute packets below issues one pair of loads and one
; unconditional branch to LOOP; the first two packets also clear the
; accumulators/products and set up the loop counter.
	LDW	.D1T1	*A4++[2],A7	;5	load x[i] from memory, then advance A4 by 2 words
||	LDW	.D2T2	*B8++[2],B7	;	load x[i+1] from memory, then advance B8 by 2 words
||	B	.S1	LOOP		;	unconditional branch to LOOP (primes the loop)
||	ZERO	.L1	A5		;	sum1 = 0
||	ZERO	.L2	B5		;	sum2 = 0
||	SUB	.S2	B4,B0,B2	;	cntr (B2) = count (B4) - B0
	LDW	.D1T1	*A4++[2],A7	;6	load x[i] from memory
||	LDW	.D2T2	*B8++[2],B7	;	load x[i+1] from memory
||	B	.S1	LOOP		;	unconditional branch to LOOP (primes the loop)
||	ZERO	.L1	A0		;	prod1 = 0
||	ZERO	.L2	B0		;	prod2 = 0 (B0 is no longer needed as the count adjustment)
	LDW	.D1T1	*A4++[2],A7	;7	load x[i] from memory
||	LDW	.D2T2	*B8++[2],B7	;	load x[i+1] from memory
||	B	.S1	LOOP		;	unconditional branch to LOOP (primes the loop)
	LDW	.D1T1	*A4++[2],A7	;8	load x[i] from memory
||	LDW	.D2T2	*B8++[2],B7	;	load x[i+1] from memory
||	B	.S1	LOOP		;	unconditional branch to LOOP (primes the loop)
	LDW	.D1T1	*A4++[2],A7	;9	load x[i] from memory
||	LDW	.D2T2	*B8++[2],B7	;	load x[i+1] from memory
||	B	.S1	LOOP		;	unconditional branch to LOOP (primes the loop)
;** --------------------------------------------------------------------------*
LOOP:     ; PIPED LOOP KERNEL
; Single execute packet: loads, multiplies, accumulations, the conditional
; branch and the counter update are all issued in parallel.
	LDW	.D1T1	*A4++[2],A7	;10	load x[i] from memory
||	LDW	.D2T2	*B8++[2],B7	;	load x[i+1] from memory
||	MPYSP	.M1	A7,A7,A0	;	prod1 = x[i]*x[i]
||	MPYSP	.M2	B7,B7,B0	;	prod2 = x[i+1]*x[i+1]
||	ADDSP	.L1	A0,A5,A5	;	sum1 = sum1 + prod1
||	ADDSP	.L2	B0,B5,B5	;	sum2 = sum2 + prod2
|| [B2] B	.S1	LOOP		; 	if(cntr) branch to loop
|| [B2]	SUB	.S2	B2,2,B2		;	if(cntr)  cntr = cntr - 2 (two elements per pass)
;** --------------------------------------------------------------------------*
; PIPED LOOP EPILOG  (not used)
; The lines below are disabled by the leading (*).  To enable the epilog,
; remove the (*) from each line and use MVK 22,B0 as the first instruction.
*
*	MPYSP	.M1	A7,A7,A0	;11	prod1 = x[i]*x[i]
*||	MPYSP	.M2	B7,B7,B0	;	prod2 = x[i+1]*x[i+1]
*||	ADDSP	.L1	A0,A5,A5	;	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5     	;	sum2 = sum2 + prod2
*
*	MPYSP	.M1	A7,A7,A0	;12	prod1 = x[i]*x[i]
*||	MPYSP	.M2	B7,B7,B0    	;	prod2 = x[i+1]*x[i+1]
*||	ADDSP	.L1	A0,A5,A5	;	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5       	;	sum2 = sum2 + prod2
*
*	MPYSP	.M1	A7,A7,A0	;13	prod1 = x[i]*x[i]
*||	MPYSP	.M2	B7,B7,B0    	;	prod2 = x[i+1]*x[i+1]
*||	ADDSP	.L1	A0,A5,A5	;	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5       	;	sum2 = sum2 + prod2
*
*	MPYSP	.M1	A7,A7,A0	;14	prod1 = x[i]*x[i]
*||	MPYSP	.M2	B7,B7,B0    	;	prod2 = x[i+1]*x[i+1]
*||	ADDSP	.L1	A0,A5,A5	;	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5       	;	sum2 = sum2 + prod2
*
*	MPYSP	.M1	A7,A7,A0	;15	prod1 = x[i]*x[i]
*||	MPYSP	.M2	B7,B7,B0    	;	prod2 = x[i+1]*x[i+1]
*||	ADDSP	.L1	A0,A5,A5	;	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5       	;	sum2 = sum2 + prod2
*
*	ADDSP	.L1	A0,A5,A5	;16	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5       	;	sum2 = sum2 + prod2
*
*	ADDSP	.L1	A0,A5,A5	;17	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5	;	sum2 = sum2 + prod2
*
*	ADDSP	.L1	A0,A5,A5	;18	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5	;	sum2 = sum2 + prod2
*
*	ADDSP	.L1	A0,A5,A5	;19	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5	;	sum2 = sum2 + prod2
*
;** --------------------------------------------------------------------------*
; FINAL REDUCTION AND RETURN
; Four back-to-back ADDSPs combine A5 and B5, alternating the destination
; between A0 and B0; the results are then added pairwise into A5 and B5 and
; finally into A4.
; NOTE(review): presumably each of the four ADDSPs reads a different partial
; sum, because results of the kernel's ADDSPs are still arriving in A5/B5 on
; successive cycles - confirm against the C67x ADDSP latency.  Do not reorder
; or remove the NOPs without re-checking the timing.
	ADDSP	.L1X	A5,B5,A0	;20	A0 = sum1 + sum2
	ADDSP	.L2X	A5,B5,B0	;21	B0 = sum1 + sum2
	ADDSP	.L1X	A5,B5,A0	;22	A0 = sum1 + sum2
	ADDSP	.L2X	A5,B5,B0	;23	B0 = sum1 + sum2
	NOP		1		;	wait for B0
	ADDSP	.L1X	A0,B0,A5	;	A5 = A0 + B0
	NOP		1		;	wait for second B0
	ADDSP	.L2X	A0,B0,B5	;	B5 = A0 + B0
	NOP		1		;	nop
	B 	.S2	B3		;	return from function (B3 holds the return address)
	NOP		1		;	wait for B5
	ADDSP	.L1X	A5,B5,A4	;	return (A5 + B5) in A4
	NOP		3		;	wait for A4 and branch
B_END:
*** END Benchmark Timing ***
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -