symv_l_sse.s

来自「Optimized GotoBLAS libraries」· S 代码 · 共 993 行 · 第 1/2 页

S
993
字号
	/*
	 * Excerpt: end of the unrolled 16-row loop (.L12), the row remainders
	 * (.L14 / .L15 / .L17 / .L18), the per-panel reduction (.L19), the
	 * 2-column and 1-column tails (.L20 / .L30), the strided copy-back of
	 * NEW_Y into Y (.L990 - .L998) and the epilogue (.L999).
	 *
	 * NOTE(review): PROLOGUE, the register aliases (a1..a4, atemp1..atemp4,
	 * xsum1..xsum4, xtemp1/xtemp2, xt1, yy1, A1, A2, XX, YY, I, IS, ...) and
	 * the labels .L11 / .L12 are defined before this excerpt; confirm any
	 * statement about them against the full file.
	 *
	 * Step repeated for every column k = 1..4 below:
	 *     xt1     = xtemp * a_k
	 *     a_k    *= atemp_k
	 *     xsum_k += xt1
	 *     yy1    += a_k
	 * followed by a reload of a_k for the next group of rows.
	 */
	movsd	12 * SIZE(A2), a3
	movhps	14 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN)
	PREFETCHW	PREFETCHSIZE(YY)
#endif

	movaps	 xtemp1, xt1
	movaps	16 * SIZE(XX), xtemp1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	12 * SIZE(A2, LDA, 1), a4
	movhps	14 * SIZE(A2, LDA, 1), a4

	/* store the finished 4 entries of y, then load the next 4 */
	movlps	 yy1,  8 * SIZE(YY)
	movhps	 yy1, 10 * SIZE(YY)
	movsd	12 * SIZE(YY), yy1
	movhps	14 * SIZE(YY), yy1

	movaps	 xtemp2, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	16 * SIZE(A1), a1
	movhps	18 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A2, LDA, 1)

	movaps	 xtemp2, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	16 * SIZE(A1, LDA, 1), a2
	movhps	18 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp2, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	16 * SIZE(A2), a3
	movhps	18 * SIZE(A2), a3

	movaps	 xtemp2, xt1
	movaps	20 * SIZE(XX), xtemp2
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	16 * SIZE(A2, LDA, 1), a4
	movhps	18 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 12 * SIZE(YY)
	movhps	 yy1, 14 * SIZE(YY)
	movsd	16 * SIZE(YY), yy1
	movhps	18 * SIZE(YY), yy1

	/* advance all four pointers by the 16 rows just consumed */
	addq	 $16 * SIZE, XX
	addq	 $16 * SIZE, YY
	addq	 $16 * SIZE, A1
	addq	 $16 * SIZE, A2

	decq	 I
	jg	 .L12
	ALIGN_3

.L14:
	/* I = M - IS - 4; its bits 3 and 2 select the 8-row and 4-row remainders */
	movq	M,  I
	subq	IS, I
	subq	$4, I
	test	$8, I
	/* after test with a small positive mask SF = OF = 0, so jle branches only when the bit is clear */
	jle	.L15

	/* 8-row remainder, first 4 rows (x values taken from xtemp1) */
	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 4 * SIZE(A1), a1
	movhps	 6 * SIZE(A1), a1

	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 4 * SIZE(A1, LDA, 1), a2
	movhps	 6 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 4 * SIZE(A2), a3
	movhps	 6 * SIZE(A2), a3

	movaps	 xtemp1, xt1
	movaps	 8 * SIZE(XX), xtemp1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 4 * SIZE(A2, LDA, 1), a4
	movhps	 6 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhps	 6 * SIZE(YY), yy1

	/* 8-row remainder, second 4 rows (x values taken from xtemp2) */
	movaps	 xtemp2, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	movaps	 xtemp2, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp2, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	 xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 4 * SIZE(YY)
	movhps	 yy1, 6 * SIZE(YY)
	movsd	 8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	addq	 $8 * SIZE, XX
	addq	 $8 * SIZE, YY
	addq	 $8 * SIZE, A1
	addq	 $8 * SIZE, A2
	ALIGN_3

.L15:
	/* 4-row remainder; the reloads fetch only 2 floats each (movsd) */
	test	$4, I
	jle	.L17

	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 4 * SIZE(A1), a1

	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 4 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 4 * SIZE(A2), a3

	movaps	 xtemp1, xt1
	movsd	 4 * SIZE(XX), xtemp1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 4 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1

	addq	 $4 * SIZE, XX
	addq	 $4 * SIZE, YY
	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2
	ALIGN_3

.L17:
	/* 2-row remainder, selected by bit 1 of M */
	testq	$2, M
	jle	.L18

	/* xtemp2 = 0; movlhps from it clears the upper two lanes of each a_k */
	pxor	 xtemp2, xtemp2

	movlhps  xtemp2, a1
	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movss	 2 * SIZE(A1), a1

	movlhps  xtemp2, a2
	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movss	 2 * SIZE(A1, LDA, 1), a2

	movlhps  xtemp2, a3
	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movss	 2 * SIZE(A2), a3

	movlhps  xtemp2, a4
	movaps	 xtemp1, xt1
	movss	 2 * SIZE(XX), xtemp1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movss	 2 * SIZE(A2, LDA, 1), a4

	/* only the low two lanes of yy1 are written back */
	movlps	 yy1, 0 * SIZE(YY)
	movss	 2 * SIZE(YY), yy1

	addq	 $2 * SIZE, XX
	addq	 $2 * SIZE, YY
	addq	 $2 * SIZE, A1
	addq	 $2 * SIZE, A2
	ALIGN_3

.L18:
	/* single-row remainder, selected by bit 0 of M; scalar arithmetic only */
	testq	$1, M
	jle	.L19

	movss	 0 * SIZE(XX), xtemp1
	movss	 0 * SIZE(YY), yy1

	movss	 0 * SIZE(A1), a1
	movss	 0 * SIZE(A1, LDA, 1), a2
	movss	 0 * SIZE(A2), a3
	movss	 0 * SIZE(A2, LDA, 1), a4

	movaps	 xtemp1, xt1
	mulss	 a1,     xt1
	mulss	 atemp1, a1
	addss	 xt1,    xsum1
	addss	 a1,     yy1

	movaps	 xtemp1, xt1
	mulss	 a2,     xt1
	mulss	 atemp2, a2
	addss	 xt1,    xsum2
	addss	 a2,     yy1

	movaps	 xtemp1, xt1
	mulss	 a3,     xt1
	mulss	 atemp3, a3
	addss	 xt1,    xsum3
	addss	 a3,     yy1

	movaps	 xtemp1, xt1
	mulss	 a4,     xt1
	mulss	 atemp4, a4
	addss	 xt1,    xsum4
	addss	 a4,     yy1

	movss	 yy1, 0 * SIZE(YY)
	ALIGN_3

.L19:
	/*
	 * Horizontal reduction: afterwards lane k of xsum1 holds the sum of
	 * the four lanes of the original xsum(k+1). Without SSE3 this is a
	 * 4x4 transpose by unpack followed by three adds; with SSE3 it is
	 * three haddps.
	 */
#ifndef HAVE_SSE3
	movaps	 xsum1,  xtemp1
	unpcklps xsum3,  xsum1
	unpckhps xsum3,  xtemp1

	movaps	 xsum2,  xtemp2
	unpcklps xsum4,  xsum2
	unpckhps xsum4,  xtemp2

	movaps	 xsum1,  xsum3
	unpcklps xsum2,  xsum1
	unpckhps xsum2,  xsum3

	movaps	 xtemp1, xsum4
	unpcklps xtemp2, xtemp1
	unpckhps xtemp2, xsum4

	addps	 xsum3,  xsum1
	addps	 xtemp1, xsum4
	addps	 xsum4,  xsum1
#else
	haddps	 xsum2, xsum1
	haddps	 xsum4, xsum3
	haddps	 xsum3, xsum1
#endif

	/* NEW_Y[IS .. IS+3] += reduced sums */
	movsd	 0 * SIZE(NEW_Y, IS, SIZE), yy1
	movhps	 2 * SIZE(NEW_Y, IS, SIZE), yy1

	addps	 xsum1, yy1

	movsd	 yy1, 0 * SIZE(NEW_Y, IS, SIZE)
	movhps	 yy1, 2 * SIZE(NEW_Y, IS, SIZE)

	/* next 4-column panel: repeat while IS + 4 <= M (IS already advanced) */
	addq	 $4, IS

	movq	 IS, I
	addq	 $4, I
	cmpq	 M, I
	jle	 .L11
	ALIGN_3

.L20:
	/* 2-column tail, selected by bit 1 of M */
	testq	$2, M
	jle	.L30

	movq	A,  A1
	leaq	2 * SIZE(A, LDA, 2), A

	/* atemp4 = NEW_X[IS .. IS+3]; aligned load, so the address must be 16-byte aligned */
	movaps	 0 * SIZE(NEW_X, IS, SIZE), atemp4

#if defined(OPTERON)
	/* NOTE(review): presumably breaks a false dependency on xsum1 before the movsd load; confirm */
	pxor	xsum1, xsum1
#endif
	/* xsum1 = { A1[0], A1[1] } * x lanes 0..1 */
	movsd	 0 * SIZE(A1), xsum1
	mulps	 atemp4, xsum1

	/* xsum2 = { A1[1], A1[LDA + 1] } * x lanes 0..1; A1[1] is reused for both columns */
	movss	 1 * SIZE(A1), xsum2
	movss	 1 * SIZE(A1, LDA, 1), a2
	unpcklps a2, xsum2
	mulps	 atemp4, xsum2

	/* broadcast x lane 0 and x lane 1 */
	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2

	testq	$1, M
	jle	.L29

	/* one extra row below the 2x2 block when M is odd */
	movss	 2 * SIZE(A1), a1
	movss	 2 * SIZE(A1, LDA, 1), a2
	movss	 2 * SIZE(NEW_X, IS, SIZE), xtemp1
	movss	 2 * SIZE(NEW_Y, IS, SIZE), yy1

	movaps	 xtemp1, xt1
	mulss	 a1,     xt1
	mulss	 atemp1, a1
	addss	 xt1,    xsum1
	/* packed add kept as in the original; only lane 0 of yy1 is stored below */
	addps	 a1,     yy1

	movaps	 xtemp1, xt1
	mulss	 a2,     xt1
	mulss	 atemp2, a2
	addss	 xt1,    xsum2
	addss	 a2,     yy1

	movss	 yy1, 2 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3

.L29:
	/* reduce xsum1 / xsum2 into the low two lanes of xsum1 */
#ifndef HAVE_SSE3
	unpcklps xsum2, xsum1
	movhlps	 xsum1, xsum2
	addps	 xsum2, xsum1
#else
	haddps	 xsum2, xsum1
	haddps	 xsum1, xsum1
#endif

	/* NEW_Y[IS .. IS+1] += reduced sums */
	movsd	 0 * SIZE(NEW_Y, IS, SIZE), yy1

	addps	 xsum1, yy1

	movlps	 yy1, 0 * SIZE(NEW_Y, IS, SIZE)

	addq	 $2, IS
	ALIGN_3

.L30:
	/* 1-column tail, selected by bit 0 of M: NEW_Y[IS] += NEW_X[IS] * A[0] */
	testq	$1, M
	jle	.L990

	movss	 0 * SIZE(NEW_X, IS, SIZE), xsum1
	mulss	 0 * SIZE(A), xsum1
	addss	 0 * SIZE(NEW_Y, IS, SIZE), xsum1
	movss	 xsum1, 0 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3

.L990:
	/*
	 * Skip the copy-back when INCY equals SIZE.
	 * NOTE(review): presumably INCY was scaled to bytes earlier and NEW_Y
	 * aliases Y in the unit-stride case; confirm against the prologue.
	 */
	cmpq   $SIZE, INCY
	je    .L999

	/* copy NEW_Y to the strided Y, 8 elements per iteration */
	movq	M,  %rax
	sarq	$3, %rax
	/* NOTE(review): OF is architecturally undefined after a shift by more than 1, and jle reads OF */
	jle	.L997
	ALIGN_3

.L996:
	movss	 0 * SIZE(NEW_Y), %xmm0
	movss	 1 * SIZE(NEW_Y), %xmm1
	movss	 2 * SIZE(NEW_Y), %xmm2
	movss	 3 * SIZE(NEW_Y), %xmm3
	movss	 4 * SIZE(NEW_Y), %xmm4
	movss	 5 * SIZE(NEW_Y), %xmm5
	movss	 6 * SIZE(NEW_Y), %xmm6
	movss	 7 * SIZE(NEW_Y), %xmm7

	movss	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm1,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm2,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm3,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm4,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm5,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm6,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm7,  0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	/* remaining M mod 8 elements, one at a time */
	movq	M, %rax
	andq	$7, %rax
	jle	.L999
	ALIGN_3

.L998:
	movss	0 * SIZE(NEW_Y), %xmm0

	movss	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y

	addq	$1 * SIZE, NEW_Y

	decq	%rax
	jg	.L998
	ALIGN_3

.L999:
	/* reload the registers from their stack slots, then release the frame */
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	/* the Windows build additionally reloads rdi, rsi and xmm6-xmm15 */
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?