⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 symv_u_sse2.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
	mulpd	 atemp2, a3	addpd	 xt1,    xsum2	addpd	 a3,     yy2	movsd	 4 * SIZE(A2, LDA, 1), a3	movhpd	 5 * SIZE(A2, LDA, 1), a3	movapd	 xtemp1, xt1	mulpd	 a1,     xt1	mulpd	 atemp3, a1	addpd	 xt1,    xsum3	addpd	 a1,     yy1	movsd	 6 * SIZE(A2, LDA, 1), a1	movhpd	 7 * SIZE(A2, LDA, 1), a1	PREFETCH	PREFETCHSIZE(A2, LDA, 1)	movapd	 xtemp2, xt1	mulpd	 a2,     xt1	mulpd	 atemp3, a2	addpd	 xt1,    xsum3	addpd	 a2,     yy2	movsd	10 * SIZE(A1), a2	movhpd	11 * SIZE(A1), a2	movapd	 xtemp1, xt1	movapd	 8 * SIZE(XX), xtemp1	mulpd	 a3,     xt1	mulpd	 atemp4, a3	addpd	 xt1,    xsum4	addpd	 a3,     yy1	movsd	 8 * SIZE(A1, LDA, 1), a3	movhpd	 9 * SIZE(A1, LDA, 1), a3	movapd	 xtemp2, xt1	movapd	10 * SIZE(XX), xtemp2	mulpd	 a1,     xt1	mulpd	 atemp4, a1	addpd	 xt1,    xsum4	addpd	 a1,     yy2	movsd	 8 * SIZE(A1), a1	movhpd	 9 * SIZE(A1), a1	movsd	 yy1, 4 * SIZE(YY)	movhpd	 yy1, 5 * SIZE(YY)	movsd	 8 * SIZE(YY), yy1	movhpd	 9 * SIZE(YY), yy1	movsd	 yy2, 6 * SIZE(YY)	movhpd	 yy2, 7 * SIZE(YY)	movsd	10 * SIZE(YY), yy2	movhpd	11 * SIZE(YY), yy2	addq	 $8 * SIZE, XX	addq	 $8 * SIZE, YY	addq	 $8 * SIZE, A1	addq	 $8 * SIZE, A2	decq	 I	jg	 .L12	ALIGN_3.L15:	testq	$4, IS	jle	.L18	movapd	 xtemp1, xt1	mulpd	 a1,     xt1	mulpd	 atemp1, a1	addpd	 xt1,    xsum1	addpd	 a1,     yy1	movsd	 2 * SIZE(A1, LDA, 1), a1	movhpd	 3 * SIZE(A1, LDA, 1), a1	movapd	 xtemp2, xt1	mulpd	 a2,     xt1	mulpd	 atemp1, a2	addpd	 xt1,    xsum1	addpd	 a2,     yy2	movsd	 0 * SIZE(A2), a2	movhpd	 1 * SIZE(A2), a2	movapd	 xtemp1, xt1	mulpd	 a3,     xt1	mulpd	 atemp2, a3	addpd	 xt1,    xsum2	addpd	 a3,     yy1	movsd	 2 * SIZE(A2), a3	movhpd	 3 * SIZE(A2), a3	movapd	 xtemp2, xt1	mulpd	 a1,     xt1	mulpd	 atemp2, a1	addpd	 xt1,    xsum2	addpd	 a1,     yy2	movsd	 0 * SIZE(A2, LDA, 1), a1	movhpd	 1 * SIZE(A2, LDA, 1), a1	movapd	 xtemp1, xt1	mulpd	 a2,     xt1	mulpd	 atemp3, a2	addpd	 xt1,    xsum3	addpd	 a2,     yy1	movsd	 2 * SIZE(A2, LDA, 1), a2	movhpd	 3 * SIZE(A2, LDA, 1), a2	movapd	 xtemp2, xt1	mulpd	 a3,     xt1	mulpd	 atemp3, a3	addpd	 xt1,    xsum3	addpd	 a3,     yy2	movapd	 xtemp1, xt1	movapd	 4 * SIZE(XX), xtemp1	mulpd	 a1,     xt1	mulpd	 atemp4, a1	addpd	 xt1,    xsum4	addpd	 a1,     yy1	movapd	 xtemp2, xt1	movapd	 6 * SIZE(XX), xtemp2	mulpd	 a2,     xt1	mulpd	 atemp4, a2	addpd	 xt1,    xsum4	addpd	 a2,     yy2	movsd	 yy1, 0 * SIZE(YY)	movhpd	 yy1, 1 * SIZE(YY)	movsd	 4 * SIZE(YY), yy1	movhpd	 5 * SIZE(YY), yy1	movsd	 yy2, 2 * SIZE(YY)	movhpd	 yy2, 3 * SIZE(YY)	movsd	 6 * SIZE(YY), yy2	movhpd	 7 * SIZE(YY), yy2	addq	 $4 * SIZE, XX	addq	 $4 * SIZE, YY	addq	 $4 * SIZE, A1	addq	 $4 * SIZE, A2	ALIGN_3.L18:	unpckhpd atemp2, atemp1	unpckhpd atemp4, atemp3	movsd	 0 * SIZE(A1), a1	movhpd	 0 * SIZE(A1, LDA, 1), a1	mulpd	 atemp1, a1	addpd	 a1, xsum1	movsd	 0 * SIZE(A1, LDA, 1), a1	movhpd	 1 * SIZE(A1, LDA, 1), a1	mulpd	 atemp1, a1	addpd	 a1, xsum2	movsd	 0 * SIZE(A2), a1	movhpd	 1 * SIZE(A2), a1	mulpd	 atemp1, a1	addpd	 a1, xsum3	movsd	 0 * SIZE(A2, LDA, 1), a1	movhpd	 1 * SIZE(A2, LDA, 1), a1	mulpd	 atemp1, a1	addpd	 a1, xsum4	movsd	 0 * SIZE(A2), a1	movhpd	 0 * SIZE(A2, LDA, 1), a1	mulpd	 atemp3, a1	addpd	 a1, xsum1	movsd	 1 * SIZE(A2), a1	movhpd	 1 * SIZE(A2, LDA, 1), a1	mulpd	 atemp3, a1	addpd	 a1, xsum2	movsd	 2 * SIZE(A2), a1	movhpd	 2 * SIZE(A2, LDA, 1), a1	mulpd	 atemp3, a1	addpd	 a1, xsum3	movsd	 2 * SIZE(A2, LDA, 1), a1	movhpd	 3 * SIZE(A2, LDA, 1), a1	mulpd	 atemp3, a1	addpd	 a1, xsum4#ifndef HAVE_SSE3	movapd	xsum1, atemp1	movapd	xsum3, atemp3	unpcklpd xsum2, xsum1	unpcklpd xsum4, xsum3	unpckhpd xsum2, atemp1	unpckhpd xsum4, atemp3	addpd	 atemp1, xsum1	addpd	 atemp3, xsum3#else	haddpd	 xsum2, xsum1	haddpd	 xsum4, xsum3#endif	addpd	 xsum1, yy1	addpd	 xsum3, yy2	movsd	 yy1, 0 * SIZE(YY)	movhpd	 yy1, 1 * SIZE(YY)	movsd	 yy2, 2 * SIZE(YY)	movhpd	 yy2, 3 * SIZE(YY)	addq	 $4, IS	movq	 IS, I	addq	 $4, I	cmpq	 M, I	jle	 .L11	ALIGN_3.L20:	testq	$2, M	je	.L30	ALIGN_3.L21:	movq	A,  A1	leaq	(A, LDA, 2), A#ifdef HAVE_SSE3	movddup		0 * SIZE(NEW_X, IS, SIZE), atemp1	movddup		1 * SIZE(NEW_X, IS, SIZE), atemp2#else	movsd		0 * SIZE(NEW_X, IS, SIZE), atemp1	movhpd		0 * SIZE(NEW_X, IS, SIZE), atemp1	movsd		1 * SIZE(NEW_X, IS, SIZE), atemp2	movhpd		1 * SIZE(NEW_X, IS, SIZE), atemp2#endif	pxor		xsum1, xsum1	pxor		xsum2, xsum2	movapd	 0 * SIZE(NEW_X), xtemp1	movsd	 0 * SIZE(NEW_Y), yy1	movhpd	 1 * SIZE(NEW_Y), yy1	movsd	 0 * SIZE(A1), a1	movhpd	 1 * SIZE(A1), a1	movsd	 0 * SIZE(A1, LDA, 1), a2	movhpd	 1 * SIZE(A1, LDA, 1), a2	movq		NEW_X, XX	movq		NEW_Y, YY	movq	IS,  I	sarq	$1,  I	jle	.L28	ALIGN_3.L22:	movapd	 xtemp1, xt1	mulpd	 a1,     xt1	mulpd	 atemp1, a1	addpd	 xt1,    xsum1	addpd	 a1,     yy1	movsd	 2 * SIZE(A1), a1	movhpd	 3 * SIZE(A1), a1	movapd	 xtemp1, xt1	movapd	 2 * SIZE(XX), xtemp1	mulpd	 a2,     xt1	mulpd	 atemp2, a2	addpd	 xt1,    xsum2	addpd	 a2,     yy1	movsd	 2 * SIZE(A1, LDA, 1), a2	movhpd	 3 * SIZE(A1, LDA, 1), a2	movsd	 yy1, 0 * SIZE(YY)	movhpd	 yy1, 1 * SIZE(YY)	movsd	 2 * SIZE(YY), yy1	movhpd	 3 * SIZE(YY), yy1	addq	 $2 * SIZE, XX	addq	 $2 * SIZE, YY	addq	 $2 * SIZE, A1	decq	 I	jg	 .L22	ALIGN_3.L28:	unpckhpd atemp2, atemp1	movsd	 0 * SIZE(A1), a1	movhpd	 0 * SIZE(A1, LDA, 1), a1	mulpd	 atemp1, a1	addpd	 a1, xsum1	movsd	 0 * SIZE(A1, LDA, 1), a1	movhpd	 1 * SIZE(A1, LDA, 1), a1	mulpd	 atemp1, a1	addpd	 a1, xsum2#ifndef HAVE_SSE3	movapd	xsum1, atemp1	unpcklpd xsum2, xsum1	unpckhpd xsum2, atemp1	addpd	 atemp1, xsum1#else	haddpd	 xsum2, xsum1#endif	addpd	 xsum1, yy1	movsd	 yy1, 0 * SIZE(YY)	movhpd	 yy1, 1 * SIZE(YY)	addq	 $2, IS	ALIGN_3.L30:	testq	$1, M	je	.L990	ALIGN_3.L31:	movq	A,  A1#ifdef HAVE_SSE3	movddup		0 * SIZE(NEW_X, IS, SIZE), atemp1#else	movsd		0 * SIZE(NEW_X, IS, SIZE), atemp1	movhpd		0 * SIZE(NEW_X, IS, SIZE), atemp1#endif	pxor		xsum1, xsum1	movsd	 0 * SIZE(NEW_X), xtemp1	movsd	 0 * SIZE(NEW_Y), yy1	movsd	 0 * SIZE(A1), a1	movq		NEW_X, XX	movq		NEW_Y, YY	movq	IS,  I	testq	I,  I	jle	.L38	ALIGN_3.L32:	movapd	 xtemp1, xt1	mulpd	 a1,     xt1	mulpd	 atemp1, a1	addpd	 xt1,    xsum1	addpd	 a1,     yy1	movsd	 1 * SIZE(A1), a1	movsd	 1 * SIZE(XX), xtemp1	movsd	 yy1, 0 * SIZE(YY)	movsd	 1 * SIZE(YY), yy1	addq	 $1 * SIZE, XX	addq	 $1 * SIZE, YY	addq	 $1 * SIZE, A1	decq	 I	jg	 .L32	ALIGN_3.L38:	movsd	 0 * SIZE(A1), a1	mulsd	 atemp1, a1	addsd	 a1, xsum1	movsd	 0 * SIZE(A1, LDA, 1), a1	mulsd	 atemp1, a1	addsd	 a1, xsum2	addsd	 xsum1, yy1	movsd	 yy1, 0 * SIZE(YY)	ALIGN_3.L990:	cmpq   $SIZE, INCY	je    .L999	movq	M,  %rax	sarq	$3, %rax	jle	.L997	ALIGN_3.L996:	movapd	 0 * SIZE(NEW_Y), %xmm0	movapd	 2 * SIZE(NEW_Y), %xmm1	movapd	 4 * SIZE(NEW_Y), %xmm2	movapd	 6 * SIZE(NEW_Y), %xmm3	movsd	%xmm0,  0 * SIZE(Y)	addq	INCY, Y	movhpd	%xmm0,  0 * SIZE(Y)	addq	INCY, Y	movsd	%xmm1,  0 * SIZE(Y)	addq	INCY, Y	movhpd	%xmm1,  0 * SIZE(Y)	addq	INCY, Y	movsd	%xmm2,  0 * SIZE(Y)	addq	INCY, Y	movhpd	%xmm2,  0 * SIZE(Y)	addq	INCY, Y	movsd	%xmm3,  0 * SIZE(Y)	addq	INCY, Y	movhpd	%xmm3,  0 * SIZE(Y)	addq	INCY, Y	addq	$8 * SIZE, NEW_Y	decq	%rax	jg	.L996	ALIGN_3.L997:	movq	M, %rax	andq	$7, %rax	jle	.L999	ALIGN_3.L998:	movsd	0 * SIZE(NEW_Y), %xmm0	movsd	%xmm0,  0 * SIZE(Y)	addq	INCY, Y	addq	$1 * SIZE, NEW_Y	decq	%rax	jg	.L998	ALIGN_3.L999:	movq	  0(%rsp), %rbx	movq	  8(%rsp), %rbp	movq	 16(%rsp), %r12	movq	 24(%rsp), %r13	movq	 32(%rsp), %r14	movq	 40(%rsp), %r15#ifdef WINDOWS_ABI	movq	 48(%rsp), %rdi	movq	 56(%rsp), %rsi	movups	 64(%rsp), %xmm6	movups	 80(%rsp), %xmm7	movups	 96(%rsp), %xmm8	movups	112(%rsp), %xmm9	movups	128(%rsp), %xmm10	movups	144(%rsp), %xmm11	movups	160(%rsp), %xmm12	movups	176(%rsp), %xmm13	movups	192(%rsp), %xmm14	movups	208(%rsp), %xmm15#endif	addq	$STACKSIZE, %rsp	ret	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -