⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dot_sse_core2.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
	/*
	 * Tail of a single-precision dot-product kernel (AT&T syntax, run
	 * through the C preprocessor).  The function entry, the definitions
	 * of N, X, Y, INCX, INCY and SIZE, and the macros SHUFPD_1, ALIGN_n,
	 * RESTOREREGISTERS and EPILOGUE are outside this view.
	 *
	 * Conventions visible in this fragment:
	 *   - %xmm0-%xmm3 are four independent partial-sum accumulators; they
	 *     are only combined at .L999.
	 *   - Packed paths address X and Y with a -32 * SIZE displacement.
	 *     NOTE(review): presumably both pointers were advanced by
	 *     32 * SIZE before reaching this code - confirm against the
	 *     part of the file that is not shown.
	 *   - Y is used directly as a memory operand of mulps, which needs a
	 *     16-byte aligned address; the alignment of Y must therefore be
	 *     established earlier (not visible here).
	 *   - "testq $k, N ; jle" is used as "skip if bit k of N is clear":
	 *     testq clears OF and, for these small masks, SF, so jle is taken
	 *     exactly when the result is zero.
	 *   - "subq $-32 * SIZE, reg" advances reg by 32 elements.
	 *
	 * The first five instructions finish a two-element remainder step
	 * whose beginning lies before this view (it appears to mirror the
	 * steps at .L36 and .L46): multiply the pair already in %xmm4 by the
	 * pair loaded from Y and accumulate into %xmm3.
	 */
	movsd	-32 * SIZE(Y), %xmm8
	mulps	%xmm8,  %xmm4
	addps	%xmm4,  %xmm3
	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L27:
	/* Last odd element of the preceding path, then go reduce. */
	testq	$1, N
	jle	.L999
	ALIGN_3

	movss	-32 * SIZE(X), %xmm4
	mulss	-32 * SIZE(Y), %xmm4
	addss	%xmm4,  %xmm0
	jmp	.L999
	ALIGN_3

.L30:
	/*
	 * Unit-stride path for an X that is not 16-byte aligned.  If the
	 * SIZE bit of the X address is set, use the palignr path at .L40.
	 * Otherwise X is read with aligned loads starting two elements
	 * early (-34 * SIZE) and neighbouring vectors are spliced with
	 * SHUFPD_1.
	 * NOTE(review): SHUFPD_1 is a macro defined elsewhere; presumably
	 * "shufpd $1, src, dst", giving {high half of dst, low half of src}
	 * - confirm.  %xmm4 always carries the previously loaded X vector
	 * into the next step.
	 */
	testq	$1 * SIZE, X
	jne	.L40

	movaps	 -34 * SIZE(X), %xmm4

	/* %rax = N / 32 = number of full 32-element iterations. */
	movq	N,  %rax
	sarq	$5, %rax
	jle	.L33
	ALIGN_4

.L31:
	/* Main loop: 32 elements per iteration, 4 per accumulator step. */
	movaps	 -30 * SIZE(X), %xmm5
	SHUFPD_1 %xmm5, %xmm4
	movaps	 -26 * SIZE(X), %xmm6
	SHUFPD_1 %xmm6, %xmm5
	movaps	 -22 * SIZE(X), %xmm7
	SHUFPD_1 %xmm7, %xmm6
	movaps	 -18 * SIZE(X), %xmm8
	SHUFPD_1 %xmm8, %xmm7

	mulps	-32 * SIZE(Y), %xmm4
	addps	%xmm4, %xmm0
	mulps	-28 * SIZE(Y), %xmm5
	addps	%xmm5, %xmm1
	mulps	-24 * SIZE(Y), %xmm6
	addps	%xmm6, %xmm2
	mulps	-20 * SIZE(Y), %xmm7
	addps	%xmm7, %xmm3

	movaps	 -14 * SIZE(X), %xmm9
	SHUFPD_1 %xmm9,  %xmm8
	movaps	 -10 * SIZE(X), %xmm10
	SHUFPD_1 %xmm10, %xmm9
	movaps	  -6 * SIZE(X), %xmm11
	SHUFPD_1 %xmm11, %xmm10
	/* This load becomes the carried vector for the next iteration. */
	movaps	  -2 * SIZE(X), %xmm4
	subq	$-32 * SIZE, X
	SHUFPD_1 %xmm4,  %xmm11

	mulps	-16 * SIZE(Y), %xmm8
	addps	%xmm8,  %xmm0
	mulps	-12 * SIZE(Y), %xmm9
	addps	%xmm9,  %xmm1
	mulps	 -8 * SIZE(Y), %xmm10
	addps	%xmm10, %xmm2
	mulps	 -4 * SIZE(Y), %xmm11
	subq	$-32 * SIZE, Y
	addps	%xmm11, %xmm3

	subq	$1, %rax
	jg,pt	.L31
	ALIGN_3

.L33:
	/* Remainder: 16 elements if bit 4 of N is set. */
	testq	$16, N
	jle	.L34
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm5
	movaps	-26 * SIZE(X), %xmm6
	movaps	-22 * SIZE(X), %xmm7
	movaps	-18 * SIZE(X), %xmm8

	SHUFPD_1 %xmm5, %xmm4
	SHUFPD_1 %xmm6, %xmm5
	SHUFPD_1 %xmm7, %xmm6
	SHUFPD_1 %xmm8, %xmm7

	mulps	-32 * SIZE(Y), %xmm4
	mulps	-28 * SIZE(Y), %xmm5
	mulps	-24 * SIZE(Y), %xmm6
	mulps	-20 * SIZE(Y), %xmm7

	addps	%xmm4, %xmm0
	addps	%xmm5, %xmm1
	addps	%xmm6, %xmm2
	addps	%xmm7, %xmm3

	/* Keep the last raw X vector as the carry for the next step. */
	movaps	%xmm8, %xmm4

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L34:
	/* Remainder: 8 elements if bit 3 of N is set. */
	testq	$8, N
	jle	.L35
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm5
	SHUFPD_1 %xmm5, %xmm4
	movaps	-26 * SIZE(X), %xmm6
	SHUFPD_1 %xmm6, %xmm5

	mulps	-32 * SIZE(Y), %xmm4
	mulps	-28 * SIZE(Y), %xmm5

	addps	%xmm4, %xmm0
	addps	%xmm5, %xmm1
	movaps	%xmm6, %xmm4

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L35:
	/* Remainder: 4 elements if bit 2 of N is set (accumulates in %xmm2). */
	testq	$4, N
	jle	.L36
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm5
	SHUFPD_1 %xmm5, %xmm4
	mulps	-32 * SIZE(Y), %xmm4
	addps	%xmm4, %xmm2
	movaps	%xmm5, %xmm4

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L36:
	/*
	 * Remainder: 2 elements if bit 1 of N is set.  The movsd loads fetch
	 * 8 bytes and zero the upper half of the register, so the packed
	 * multiply/add only contributes in the two low lanes.
	 */
	testq	$2, N
	jle	.L37
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm4
	movsd	-32 * SIZE(Y), %xmm8
	mulps	%xmm8,  %xmm4
	addps	%xmm4,  %xmm3
	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L37:
	/* Remainder: last single element if bit 0 of N is set. */
	testq	$1, N
	jle	.L999
	ALIGN_3

	movss	-32 * SIZE(X), %xmm4
	mulss	-32 * SIZE(Y), %xmm4
	addss	%xmm4, %xmm0
	jmp	.L999
	ALIGN_3

.L40:
	/*
	 * Unit-stride path reached from .L30 when the SIZE bit of X is set.
	 * X is read with aligned loads starting three elements early
	 * (-35 * SIZE) and realigned with palignr $12: each result is the
	 * top 4 bytes of the previous vector followed by the low 12 bytes
	 * of the current one.
	 * NOTE(review): movaps requires the address -35 * SIZE(X) to be
	 * 16-byte aligned; the check that guarantees this is not visible
	 * in this view - confirm.
	 */
	movaps	-35 * SIZE(X), %xmm4
	movq	N,  %rax
	sarq	$5, %rax
	jle	.L43

	/*
	 * Software-pipeline prologue: preload the first four X vectors and
	 * zero %xmm12-%xmm15, which hold the "pending" products that are
	 * added at the top of .L41/.L42.
	 */
	movaps	-31 * SIZE(X), %xmm5
	pxor	%xmm12, %xmm12
	movaps	-27 * SIZE(X), %xmm6
	pxor	%xmm13, %xmm13
	movaps	-23 * SIZE(X), %xmm7
	pxor	%xmm14, %xmm14
	movaps	-19 * SIZE(X), %xmm8
	pxor	%xmm15, %xmm15

	/* Peel one iteration: the final 32-element block is done in .L42. */
	decq	%rax
	jle .L42

	ALIGN_3

.L41:
	/*
	 * Pipelined main loop, 32 elements per iteration.  Adds the products
	 * left pending by the previous iteration while loading the second
	 * half of this block's X vectors.
	 */
	addps	%xmm12, %xmm0
	movaps	-15 * SIZE(X), %xmm12
	addps	%xmm13, %xmm1
	movaps	-11 * SIZE(X), %xmm13
	addps	%xmm14, %xmm2
	movaps	 -7 * SIZE(X), %xmm14
	addps	%xmm15, %xmm3
	movaps	 -3 * SIZE(X), %xmm15

	/* Save each raw vector before palignr overwrites it; it is the */
	/* "previous" operand for the following realignment. */
	movaps	%xmm5, %xmm9
	palignr	$12, %xmm4, %xmm5
	mulps	-32 * SIZE(Y), %xmm5
	movaps	%xmm6, %xmm10
	palignr	$12, %xmm9, %xmm6
	mulps	-28 * SIZE(Y), %xmm6
	movaps	%xmm7, %xmm11
	palignr	$12, %xmm10, %xmm7
	mulps	-24 * SIZE(Y), %xmm7
	movaps	%xmm8, %xmm4
	palignr	$12, %xmm11, %xmm8
	mulps	-20 * SIZE(Y), %xmm8

	/* Accumulate the first half and preload the next block's vectors */
	/* (positive displacements: X has not been advanced yet). */
	addps	%xmm5, %xmm0
	movaps	  1 * SIZE(X), %xmm5
	addps	%xmm6, %xmm1
	movaps	  5 * SIZE(X), %xmm6
	addps	%xmm7, %xmm2
	movaps	  9 * SIZE(X), %xmm7
	addps	%xmm8, %xmm3
	movaps	 13 * SIZE(X), %xmm8

	/* Second half: products stay pending in %xmm12-%xmm15. */
	movaps	%xmm12, %xmm9
	palignr	$12, %xmm4, %xmm12
	mulps	-16 * SIZE(Y), %xmm12
	movaps	%xmm13, %xmm10
	palignr	$12, %xmm9, %xmm13
	mulps	-12 * SIZE(Y), %xmm13
	movaps	%xmm14, %xmm11
	palignr	$12, %xmm10, %xmm14
	mulps	 -8 * SIZE(Y), %xmm14

	subq	$-32 * SIZE, X
	movaps	%xmm15, %xmm4
	palignr	$12, %xmm11, %xmm15
	mulps	 -4 * SIZE(Y), %xmm15
	subq	$-32 * SIZE, Y

	subq	$1, %rax
	jg,pt	.L41
	ALIGN_3

.L42:
	/*
	 * Pipeline drain: add the pending products, then process the final
	 * 32-element block completely, without preloading past it.
	 */
	addps	%xmm12, %xmm0
	movaps	-15 * SIZE(X), %xmm12
	addps	%xmm13, %xmm1
	movaps	-11 * SIZE(X), %xmm13
	addps	%xmm14, %xmm2
	movaps	 -7 * SIZE(X), %xmm14
	addps	%xmm15, %xmm3
	movaps	 -3 * SIZE(X), %xmm15

	movaps	%xmm5, %xmm9
	palignr	$12, %xmm4, %xmm5
	movaps	%xmm6, %xmm10
	palignr	$12, %xmm9, %xmm6
	movaps	%xmm7, %xmm11
	palignr	$12, %xmm10, %xmm7
	movaps	%xmm8, %xmm4
	palignr	$12, %xmm11, %xmm8

	mulps	-32 * SIZE(Y), %xmm5
	mulps	-28 * SIZE(Y), %xmm6
	mulps	-24 * SIZE(Y), %xmm7
	mulps	-20 * SIZE(Y), %xmm8

	addps	%xmm5, %xmm0
	addps	%xmm6, %xmm1
	addps	%xmm7, %xmm2
	addps	%xmm8, %xmm3

	movaps	%xmm12, %xmm9
	palignr	$12, %xmm4, %xmm12
	movaps	%xmm13, %xmm10
	palignr	$12, %xmm9, %xmm13
	movaps	%xmm14, %xmm11
	palignr	$12, %xmm10, %xmm14
	/* %xmm4 again ends up holding the last raw X vector (the carry). */
	movaps	%xmm15, %xmm4
	palignr	$12, %xmm11, %xmm15

	mulps	-16 * SIZE(Y), %xmm12
	mulps	-12 * SIZE(Y), %xmm13
	mulps	 -8 * SIZE(Y), %xmm14
	mulps	 -4 * SIZE(Y), %xmm15

	addps	%xmm12, %xmm0
	addps	%xmm13, %xmm1
	addps	%xmm14, %xmm2
	addps	%xmm15, %xmm3

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L43:
	/* Remainder: 16 elements if bit 4 of N is set. */
	testq	$16, N
	jle	.L44
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm5
	movaps	-27 * SIZE(X), %xmm6
	movaps	-23 * SIZE(X), %xmm7
	movaps	-19 * SIZE(X), %xmm8

	movaps	%xmm5, %xmm9
	movaps	%xmm6, %xmm10
	palignr	$12, %xmm4, %xmm5
	palignr	$12, %xmm9, %xmm6
	movaps	%xmm7, %xmm11
	/* Copy the raw last vector into the carry before it is realigned. */
	movaps	%xmm8, %xmm4
	palignr	$12, %xmm10, %xmm7
	palignr	$12, %xmm11, %xmm8

	mulps	-32 * SIZE(Y), %xmm5
	mulps	-28 * SIZE(Y), %xmm6
	mulps	-24 * SIZE(Y), %xmm7
	mulps	-20 * SIZE(Y), %xmm8

	addps	%xmm5, %xmm0
	addps	%xmm6, %xmm1
	addps	%xmm7, %xmm2
	addps	%xmm8, %xmm3

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L44:
	/* Remainder: 8 elements if bit 3 of N is set. */
	testq	$8, N
	jle	.L45
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm5
	movaps	-27 * SIZE(X), %xmm6
	movaps	%xmm5, %xmm7
	movaps	%xmm6, %xmm8

	palignr	$12, %xmm4, %xmm5
	palignr	$12, %xmm7, %xmm6
	movaps	%xmm8, %xmm4

	mulps	-32 * SIZE(Y), %xmm5
	mulps	-28 * SIZE(Y), %xmm6

	addps	%xmm5,  %xmm0
	addps	%xmm6,  %xmm1

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L45:
	/*
	 * Remainder: 4 elements if bit 2 of N is set.  The carry in %xmm4 is
	 * not refreshed here; the remaining steps load from X directly.
	 */
	testq	$4, N
	jle	.L46
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm5
	palignr	$12, %xmm4, %xmm5
	mulps	-32 * SIZE(Y), %xmm5
	addps	%xmm5,  %xmm2

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L46:
	/* Remainder: 2 elements if bit 1 of N is set (upper lanes are zero). */
	testq	$2, N
	jle	.L47
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm4
	movsd	-32 * SIZE(Y), %xmm8
	mulps	%xmm8,  %xmm4
	addps	%xmm4,  %xmm3
	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L47:
	/* Remainder: last single element if bit 0 of N is set. */
	testq	$1, N
	jle	.L999
	ALIGN_3

	movss	-32 * SIZE(X), %xmm4
	mulss	-32 * SIZE(Y), %xmm4
	addss	%xmm4,  %xmm0
	jmp	.L999
	ALIGN_3

.L50:
	/*
	 * Scalar path that steps X and Y by INCX and INCY.  Elements are
	 * addressed with a 0 * SIZE displacement here, unlike the packed
	 * paths above.
	 * NOTE(review): INCX/INCY are added straight to the pointers, so
	 * they are presumably already scaled to bytes by code outside this
	 * view - confirm.
	 */
#ifdef F_INTERFACE
	/* For a negative INCX: X -= (N - 1) * INCX. */
	testq	INCX, INCX
	jge	.L51

	movq	N, %rax
	decq	%rax
	imulq	INCX, %rax
	subq	%rax, X
	ALIGN_3

.L51:
	/* For a negative INCY: Y -= (N - 1) * INCY. */
	testq	INCY, INCY
	jge	.L52

	movq	N, %rax
	decq	%rax
	imulq	INCY, %rax
	subq	%rax, Y
	ALIGN_3

.L52:
#endif
	/* %rax = N / 4 = number of 4-element iterations. */
	movq	N,  %rax
	sarq	$2, %rax
	jle	.L55
	ALIGN_3

.L53:
	/* Four scalar products per iteration, one per accumulator. */
	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	mulss	0 * SIZE(Y), %xmm4
	addq	INCY, Y
	movss	0 * SIZE(X), %xmm5
	addq	INCX, X
	mulss	0 * SIZE(Y), %xmm5
	addq	INCY, Y
	movss	0 * SIZE(X), %xmm6
	addq	INCX, X
	mulss	0 * SIZE(Y), %xmm6
	addq	INCY, Y
	movss	0 * SIZE(X), %xmm7
	addq	INCX, X
	mulss	0 * SIZE(Y), %xmm7
	addq	INCY, Y

	addss	%xmm4, %xmm0
	addss	%xmm5, %xmm1
	addss	%xmm6, %xmm2
	addss	%xmm7, %xmm3

	decq	%rax
	jg	.L53
	ALIGN_3

.L55:
	/* %rax = N & 3 = leftover elements; nothing to do when zero. */
	movq	N, %rax
	andq	$3,   %rax
	jle	.L999
	ALIGN_3

.L56:
	/* One scalar product per iteration for the leftover elements. */
	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	mulss	0 * SIZE(Y), %xmm4
	addq	INCY, Y
	addss	%xmm4, %xmm0
	decq	%rax
	jg	.L56
	ALIGN_3

.L999:
	/*
	 * Common exit: fold the four accumulators into %xmm0, then sum its
	 * four lanes so that the result ends up in the low lane of %xmm0.
	 */
	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2
	addps	%xmm2, %xmm0

#ifndef HAVE_SSE3
	/* Without SSE3: add the high pair onto the low pair, then add */
	/* lane 1 onto lane 0. */
	movhlps	%xmm0, %xmm1
	addps	%xmm1, %xmm0

	movaps	%xmm0, %xmm1
	shufps  $1, %xmm0, %xmm0
	addss	 %xmm1, %xmm0
#else
	/* With SSE3: two horizontal adds leave the total in every lane. */
	haddps	%xmm0, %xmm0
	haddps	%xmm0, %xmm0
#endif

#if !defined(DOUBLE) && defined(F_INTERFACE) && defined(NEED_F2CCONV)
	/* Widen the single-precision result to double in this configuration. */
	cvtss2sd	%xmm0, %xmm0
#endif

	/* RESTOREREGISTERS and EPILOGUE are macros defined outside this view. */
	RESTOREREGISTERS

	ret

	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -