⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 symv_u_sse.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
	/*
	 * Tail of the kernel routine.  The routine's prologue, the head of the
	 * .L11/.L12 loops and every macro used below (a1..a4, atemp1..atemp4,
	 * xtemp1/xtemp2, xt1, xsum1..xsum4, yy1, A, A1, A2, LDA, XX, YY,
	 * NEW_X, NEW_Y, Y, INCY, M, IS, I, SIZE, STACKSIZE, ALIGN_3, EPILOGUE)
	 * are defined before this point and are not visible here.
	 *
	 * Recurring step in the blocked loops below, for each of four column
	 * registers a1..a4:
	 *     xt1     = xtemp * aK      ; accumulated into xsumK
	 *     aK      = aK * atempK     ; accumulated into yy1
	 * after which aK is reloaded from the next 4 elements of its column
	 * and yy1 is stored to / reloaded from YY.
	 * NOTE(review): this looks like a single-precision symmetric
	 * matrix-vector product over the upper triangle -- confirm against the
	 * part of the file that precedes this fragment.
	 */

	/* ---- end of the .L12 loop body (its beginning is outside this view) ---- */
	addps	 a3,     yy1
	movsd	16 * SIZE(A2), a3
	movhps	18 * SIZE(A2), a3

	movaps	 xtemp2, xt1
	movaps	20 * SIZE(XX), xtemp2
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	16 * SIZE(A2, LDA, 1), a4
	movhps	18 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 12 * SIZE(YY)
	movhps	 yy1, 14 * SIZE(YY)
	movsd	16 * SIZE(YY), yy1
	movhps	18 * SIZE(YY), yy1

	addq	 $16 * SIZE, XX
	addq	 $16 * SIZE, YY
	addq	 $16 * SIZE, A1
	addq	 $16 * SIZE, A2

	decq	 I
	jg	 .L12
	ALIGN_3

/* ---- .L14: executed only when bit 3 of IS is set; advances 8 elements ---- */
.L14:
	testq	$8, IS
	jle	.L15

	/* first group of 4 elements, using xtemp1 */
	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 4 * SIZE(A1), a1
	movhps	 6 * SIZE(A1), a1

	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 4 * SIZE(A1, LDA, 1), a2
	movhps	 6 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 4 * SIZE(A2), a3
	movhps	 6 * SIZE(A2), a3

	movaps	 xtemp1, xt1
	movaps	 8 * SIZE(XX), xtemp1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 4 * SIZE(A2, LDA, 1), a4
	movhps	 6 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhps	 6 * SIZE(YY), yy1

	/* second group of 4 elements, using xtemp2 */
	movaps	 xtemp2, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	movaps	 xtemp2, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp2, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	 xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 4 * SIZE(YY)
	movhps	 yy1, 6 * SIZE(YY)
	movsd	 8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	addq	 $8 * SIZE, XX
	addq	 $8 * SIZE, YY
	addq	 $8 * SIZE, A1
	addq	 $8 * SIZE, A2
	ALIGN_3

/* ---- .L15: executed only when bit 2 of IS is set; advances 4 elements ---- */
.L15:
	testq	$4, IS
	jle	.L18

	/* No reload of a1..a4 here: .L18 loads what it needs itself. */
	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1

	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1

	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1

	movaps	 xtemp1, xt1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhps	 6 * SIZE(YY), yy1

	addq	 $4 * SIZE, XX
	addq	 $4 * SIZE, YY
	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2
	ALIGN_3

/*
 * ---- .L18: 4x4 block at the current A1/A2 position ----
 * Each of the four vectors multiplied by atemp1 is gathered from the four
 * column pointers A1, A1+LDA, A2, A2+LDA.  Elements are read only at
 * row offset <= column index within the block, and the remaining lanes are
 * filled from the other columns.
 * NOTE(review): presumably this mirrors the stored upper triangle of the
 * diagonal block -- confirm.
 */
.L18:
	movaps		0 * SIZE(NEW_X, IS, SIZE), atemp1

	/* lanes = element 0 of each of the four columns */
	movss	 0 * SIZE(A1), a1
	movss	 0 * SIZE(A1, LDA, 1), a2
	movss	 0 * SIZE(A2), a3
	movss	 0 * SIZE(A2, LDA, 1), a4

	unpcklps a3, a1
	unpcklps a4, a2
	unpcklps a2, a1

	mulps	 atemp1, a1
	addps	 a1, xsum1

	/* lanes 0-1 from column A1+LDA; lanes 2-3 = element 1 of A2 and A2+LDA */
	movsd	 0 * SIZE(A1, LDA, 1), a1
	movss	 1 * SIZE(A2), a2
	movhps	 1 * SIZE(A2, LDA, 1), a2

	/* 0x84 selects a1[0], a1[1], a2[0], a2[2] */
	shufps	 $0x84, a2, a1

	mulps	 atemp1, a1
	addps	 a1, xsum2

	/* lanes 0-1 from column A2; lanes 2-3 = element 2 of A2, element 3 of A2+LDA */
	movsd	 0 * SIZE(A2), a1
	movss	 2 * SIZE(A2), a2
	movhps	 2 * SIZE(A2, LDA, 1), a2

	shufps	 $0x84, a2, a1

	mulps	 atemp1, a1
	addps	 a1, xsum3

	/* all four lanes from column A2+LDA */
	movsd	 0 * SIZE(A2, LDA, 1), a1
	movhps	 2 * SIZE(A2, LDA, 1), a1

	mulps	 atemp1, a1
	addps	 a1, xsum4

	/*
	 * Reduce xsum1..xsum4 so that lane k of xsum1 holds the horizontal
	 * sum of xsum(k+1).  Without SSE3 this is a 4x4 transpose followed by
	 * vertical adds; with SSE3 it is three haddps.
	 */
#ifndef HAVE_SSE3
	movaps	 xsum1,  xtemp1
	unpcklps xsum3,  xsum1
	unpckhps xsum3,  xtemp1

	movaps	 xsum2,  xtemp2
	unpcklps xsum4,  xsum2
	unpckhps xsum4,  xtemp2

	movaps	 xsum1,  xsum3
	unpcklps xsum2,  xsum1
	unpckhps xsum2,  xsum3

	movaps	 xtemp1, xsum4
	unpcklps xtemp2, xtemp1
	unpckhps xtemp2, xsum4

	addps	 xsum3,  xsum1
	addps	 xtemp1, xsum4
	addps	 xsum4,  xsum1
#else
	haddps	 xsum2, xsum1
	haddps	 xsum4, xsum3

	haddps	 xsum3, xsum1
#endif

	addps	 xsum1, yy1

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)

	addq	 $4, IS

	/* Loop back to .L11 (outside this view) while IS + 4 <= M. */
	movq	 IS, I
	addq	 $4, I
	cmpq	 M, I
	jle	 .L11
	ALIGN_3

/* ---- .L20: two-column tail, taken only when bit 1 of M is set ---- */
.L20:
	testq	$2, M
	jle	.L30

	movq	A,  A1
	leaq	(A, LDA, 2), A

	/* Broadcast the two values at NEW_X[IS], NEW_X[IS+1] into atemp1 / atemp2. */
	movsd		0 * SIZE(NEW_X, IS, SIZE), atemp4

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2

	pxor		xsum1, xsum1
	pxor		xsum2, xsum2

	movaps	 0 * SIZE(NEW_X), xtemp1

	movsd	 0 * SIZE(A1), a1
	movhps	 2 * SIZE(A1), a1
	movsd	 0 * SIZE(A1, LDA, 1), a2
	movhps	 2 * SIZE(A1, LDA, 1), a2
	movsd	 0 * SIZE(NEW_Y), yy1
	movhps	 2 * SIZE(NEW_Y), yy1

	movq		NEW_X, XX
	movq		NEW_Y, YY

	/* I = IS / 4 iterations of 4 elements each; skip the loop when that is <= 0. */
	movq	IS,  I
	sarq	$2,  I
	jle	.L28
	ALIGN_3

.L22:
	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 4 * SIZE(A1), a1
	movhps	 6 * SIZE(A1), a1

	movaps	 xtemp1, xt1
	movaps	 4 * SIZE(XX), xtemp1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 4 * SIZE(A1, LDA, 1), a2
	movhps	 6 * SIZE(A1, LDA, 1), a2

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhps	 6 * SIZE(YY), yy1

	addq	 $4 * SIZE, XX
	addq	 $4 * SIZE, YY
	addq	 $4 * SIZE, A1

	decq	 I
	jg	 .L22
	ALIGN_3

/* ---- .L28: 2x2 block for the two-column tail ---- */
.L28:
	movsd		0 * SIZE(NEW_X, IS, SIZE), atemp1

	/* a1 = { A1[0], (A1+LDA)[0] } */
	movss	 0 * SIZE(A1), a1
	movss	 0 * SIZE(A1, LDA, 1), a2

	unpcklps a2, a1

	mulps	 atemp1, a1
	addps	 a1, xsum1

	/* a1 = { (A1+LDA)[0], (A1+LDA)[1] } */
	movsd	 0 * SIZE(A1, LDA, 1), a1
	mulps	 atemp1, a1
	addps	 a1, xsum2

	/* Leave sum(xsum1) in lane 0 and sum(xsum2) in lane 1 of xsum1. */
#ifndef HAVE_SSE3
	movhlps	 xsum1, xsum3
	movhlps	 xsum2, xsum4
	addps	 xsum3, xsum1
	addps	 xsum4, xsum2

	unpcklps xsum2, xsum1
	movhlps	 xsum1, xsum2

	addps	 xsum2, xsum1
#else
	haddps	 xsum2, xsum1
	haddps	 xsum1, xsum1
#endif

	addps	 xsum1, yy1

	/* Only the low two lanes are written back. */
	movlps	 yy1, 0 * SIZE(YY)

	addq	 $2, IS
	ALIGN_3

/* ---- .L30: one-column tail, taken only when bit 0 of M is set ---- */
.L30:
	testq	$1, M
	jle	.L990

	movq	A,  A1

	movss		0 * SIZE(NEW_X, IS, SIZE), atemp1

	pshufd	$0x00, atemp1, atemp1

	pxor		xsum1, xsum1
	pxor		xsum2, xsum2

	movss	 0 * SIZE(NEW_Y), yy1

	movss	 0 * SIZE(NEW_X), xtemp1
	movss	 1 * SIZE(NEW_X), xtemp2

	movss	 0 * SIZE(A1), a1
	movss	 1 * SIZE(A1), a2

	movq		NEW_X, XX
	movq		NEW_Y, YY

	/* I = IS / 2 iterations of 2 elements each; skip the loop when that is <= 0. */
	movq	IS,  I
	sarq	$1,  I
	jle	.L38
	ALIGN_3

.L32:
	/*
	 * Packed multiplies are used, but every operand was loaded with movss
	 * from memory (upper lanes zero), so only lane 0 carries data.  Both
	 * halves of the iteration accumulate into xsum1; xsum2 stays zero.
	 */
	movaps	 xtemp1, xt1
	movss	 2 * SIZE(XX), xtemp1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movss	 2 * SIZE(A1), a1

	movss	 yy1, 0 * SIZE(YY)
	movss	 1 * SIZE(YY), yy1

	movaps	 xtemp2, xt1
	movss	 3 * SIZE(XX), xtemp2
	mulps	 a2,     xt1
	mulps	 atemp1, a2
	addps	 xt1,    xsum1
	addps	 a2,     yy1
	movss	 3 * SIZE(A1), a2

	movss	 yy1, 1 * SIZE(YY)
	movss	 2 * SIZE(YY), yy1

	addq	 $2 * SIZE, XX
	addq	 $2 * SIZE, YY
	addq	 $2 * SIZE, A1

	decq	 I
	jg	 .L32
	ALIGN_3

/* ---- .L38: final single element for the one-column tail ---- */
.L38:
	/*
	 * NOTE(review): movsd reads 8 bytes although only the low element is
	 * used by the mulss below -- confirm the NEW_X buffer is padded.
	 */
	movsd		0 * SIZE(NEW_X, IS, SIZE), atemp1

	movss	 0 * SIZE(A1), a1
	mulss	 atemp1, a1
	addss	 a1, xsum1

#ifndef HAVE_SSE3
	movhlps	 xsum1, xsum3
	movhlps	 xsum2, xsum4
	addps	 xsum3, xsum1
	addps	 xsum4, xsum2

	unpcklps xsum2, xsum1
	movhlps	 xsum1, xsum2

	addps	 xsum2, xsum1
#else
	addss	 xsum2, xsum1
#endif

	addss	 xsum1, yy1

	movss	 yy1, 0 * SIZE(YY)

	/*
	 * NOTE(review): IS is advanced by 2 although this tail is guarded by
	 * bit 0 of M.  IS is not read again in the visible code below, so this
	 * appears to have no effect -- confirm.
	 */
	addq	 $2, IS
	ALIGN_3

/*
 * ---- .L990: copy the result from NEW_Y to Y ----
 * Skipped entirely when INCY equals SIZE.  Otherwise M elements are copied
 * one at a time, advancing Y by INCY after each store.
 * NOTE(review): presumably INCY was pre-scaled to a byte stride and NEW_Y
 * aliases Y in the skipped case -- confirm against the prologue.
 */
.L990:
	cmpq   $SIZE, INCY
	je    .L999

	/* M / 8 iterations, eight elements per iteration. */
	movq	M,  %rax
	sarq	$3, %rax
	jle	.L997
	ALIGN_3

.L996:
	movss	 0 * SIZE(NEW_Y), %xmm0
	movss	 1 * SIZE(NEW_Y), %xmm1
	movss	 2 * SIZE(NEW_Y), %xmm2
	movss	 3 * SIZE(NEW_Y), %xmm3
	movss	 4 * SIZE(NEW_Y), %xmm4
	movss	 5 * SIZE(NEW_Y), %xmm5
	movss	 6 * SIZE(NEW_Y), %xmm6
	movss	 7 * SIZE(NEW_Y), %xmm7

	movss	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm1,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm2,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm3,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm4,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm5,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm6,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm7,  0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

/* Remaining M & 7 elements, one per iteration. */
.L997:
	movq	M, %rax
	andq	$7, %rax
	jle	.L999
	ALIGN_3

.L998:
	movss	0 * SIZE(NEW_Y), %xmm0

	movss	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y

	addq	$1 * SIZE, NEW_Y

	decq	%rax
	jg	.L998
	ALIGN_3

/*
 * ---- .L999: epilogue ----
 * Reload the saved registers from the stack frame, release STACKSIZE bytes
 * and return.  Under WINDOWS_ABI, rdi, rsi and xmm6-xmm15 are reloaded too.
 * The matching saves are in the prologue, outside this view.
 */
.L999:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -