
📄 axpy_sse2_core2.s

📁 Optimized GotoBLAS libraries
💻 Assembly (S)
📖 Page 1 of 2
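
This is GotoBLAS's double-precision axpy kernel (y := alpha * x + y) tuned for SSE2 on Core 2 class CPUs; the listing picks up partway through the routine, inside its unit-stride paths. For orientation, here is a minimal C sketch of the operation the assembly implements, assuming standard BLAS axpy semantics (the function name and signature are illustrative, not taken from the source):

#include <stddef.h>

/* Reference semantics only: y[i] += alpha * x[i] over n elements with
   strides incx / incy. The assembly below computes the same thing using
   SSE2 registers, loop unrolling and alignment handling. */
static void axpy_ref(size_t n, double alpha,
                     const double *x, long incx,
                     double *y, long incy)
{
    for (size_t i = 0; i < n; i++) {
        *y += alpha * *x;
        x += incx;
        y += incy;
    }
}

With unit strides the kernel runs the vectorized paths at the top of the listing; the .L40 block near the end handles arbitrary INCX / INCY increments.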
/* Unit-stride, mutually aligned path (continued): remaining pair of elements */
	movaps	-16 * SIZE(Y), %xmm4

	mulpd	ALPHA, %xmm0
	addpd	%xmm4, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L18:
/* Last single element, if M is odd */
	testq	$1, M
	jle	.L19

	movsd	-16 * SIZE(X), %xmm0
	movsd	-16 * SIZE(Y), %xmm4

	mulsd	ALPHA, %xmm0
	addsd	%xmm4, %xmm0

	movsd	%xmm0, -16 * SIZE(Y)
	ALIGN_3

.L19:
	xorq	%rax,%rax
	RESTOREREGISTERS
	ret
	ALIGN_3

.L20:
/* Unit-stride path where Y is 16-byte aligned but X is off by one double:
   aligned loads from X are re-spliced with SHUFPD_1 / shufps before use.
   The main loop processes 16 doubles per iteration. */
	movaps	-1 * SIZE(X), %xmm0

	movq	M,  %rax
	sarq	$4, %rax
	jle	.L23

	movaps	 1 * SIZE(X), %xmm1
	movaps	 3 * SIZE(X), %xmm2
	movaps	 5 * SIZE(X), %xmm3
	movaps	 7 * SIZE(X), %xmm8
	movaps	 9 * SIZE(X), %xmm9
	movaps	11 * SIZE(X), %xmm10
	movaps	13 * SIZE(X), %xmm11
	movaps	15 * SIZE(X), %xmm12

	movaps	0 * SIZE(Y), %xmm4
	movaps	2 * SIZE(Y), %xmm5
	movaps	4 * SIZE(Y), %xmm6
	movaps	6 * SIZE(Y), %xmm14

	decq	%rax
	jle	.L22
	ALIGN_4

.L21:
#ifdef PENRYN
	shufps	$0x4e, %xmm1, %xmm0
	shufps	$0x4e, %xmm2, %xmm1
	shufps	$0x4e, %xmm3, %xmm2
	shufps	$0x4e, %xmm8, %xmm3
	shufps	$0x4e, %xmm9,  %xmm8
	shufps	$0x4e, %xmm10, %xmm9
	shufps	$0x4e, %xmm11, %xmm10
	shufps	$0x4e, %xmm12, %xmm11
#else
	SHUFPD_1 %xmm1, %xmm0
	SHUFPD_1 %xmm2, %xmm1
	SHUFPD_1 %xmm3, %xmm2
	SHUFPD_1 %xmm8, %xmm3
	SHUFPD_1 %xmm9,  %xmm8
	SHUFPD_1 %xmm10, %xmm9
	SHUFPD_1 %xmm11, %xmm10
	SHUFPD_1 %xmm12, %xmm11
#endif

	mulpd	ALPHA, %xmm0
	mulpd	ALPHA, %xmm1
	mulpd	ALPHA, %xmm2
	mulpd	ALPHA, %xmm3
	mulpd	ALPHA, %xmm8
	mulpd	ALPHA, %xmm9
	mulpd	ALPHA, %xmm10
	mulpd	ALPHA, %xmm11

	addpd	%xmm4, %xmm0
	movaps	 8 * SIZE(Y), %xmm4
	addpd	%xmm5, %xmm1
	movaps	10 * SIZE(Y), %xmm5
	addpd	%xmm6, %xmm2
	movaps	12 * SIZE(Y), %xmm6
	addpd	%xmm14, %xmm3
	movaps	14 * SIZE(Y), %xmm14

	movaps	%xmm0, 0 * SIZE(Y)
	movaps	%xmm1, 2 * SIZE(Y)
	movaps	%xmm2, 4 * SIZE(Y)
	movaps	%xmm3, 6 * SIZE(Y)

	movaps	%xmm12, %xmm0
	movaps	17 * SIZE(X), %xmm1
	movaps	19 * SIZE(X), %xmm2
	movaps	21 * SIZE(X), %xmm3
	movaps	31 * SIZE(X), %xmm12

	addpd	%xmm4, %xmm8
	addpd	%xmm5, %xmm9
	addpd	%xmm6, %xmm10
	addpd	%xmm14, %xmm11

	movaps	16 * SIZE(Y), %xmm4
	movaps	18 * SIZE(Y), %xmm5
	movaps	20 * SIZE(Y), %xmm6
	movaps	22 * SIZE(Y), %xmm14

	movaps	%xmm8,   8 * SIZE(Y)
	movaps	%xmm9,  10 * SIZE(Y)
	movaps	%xmm10, 12 * SIZE(Y)
	movaps	%xmm11, 14 * SIZE(Y)

	movaps	23 * SIZE(X), %xmm8
	movaps	25 * SIZE(X), %xmm9
	movaps	27 * SIZE(X), %xmm10
	movaps	29 * SIZE(X), %xmm11

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y

	decq	%rax
	jg	.L21
	ALIGN_3

.L22:
/* Last unrolled iteration: same body as .L21 without loading ahead */
#ifdef PENRYN
	shufps	$0x4e, %xmm1, %xmm0
	shufps	$0x4e, %xmm2, %xmm1
	shufps	$0x4e, %xmm3, %xmm2
	shufps	$0x4e, %xmm8, %xmm3
	shufps	$0x4e, %xmm9,  %xmm8
	shufps	$0x4e, %xmm10, %xmm9
	shufps	$0x4e, %xmm11, %xmm10
	shufps	$0x4e, %xmm12, %xmm11
#else
	SHUFPD_1 %xmm1, %xmm0
	SHUFPD_1 %xmm2, %xmm1
	SHUFPD_1 %xmm3, %xmm2
	SHUFPD_1 %xmm8, %xmm3
	SHUFPD_1 %xmm9,  %xmm8
	SHUFPD_1 %xmm10, %xmm9
	SHUFPD_1 %xmm11, %xmm10
	SHUFPD_1 %xmm12, %xmm11
#endif

	mulpd	ALPHA, %xmm0
	mulpd	ALPHA, %xmm1
	mulpd	ALPHA, %xmm2
	mulpd	ALPHA, %xmm3
	mulpd	ALPHA, %xmm8
	mulpd	ALPHA, %xmm9
	mulpd	ALPHA, %xmm10
	mulpd	ALPHA, %xmm11

	addpd	%xmm4, %xmm0
	movaps	 8 * SIZE(Y), %xmm4
	addpd	%xmm5, %xmm1
	movaps	10 * SIZE(Y), %xmm5
	addpd	%xmm6, %xmm2
	movaps	12 * SIZE(Y), %xmm6
	addpd	%xmm14, %xmm3
	movaps	14 * SIZE(Y), %xmm14

	movaps	%xmm0, 0 * SIZE(Y)
	movaps	%xmm1, 2 * SIZE(Y)
	movaps	%xmm2, 4 * SIZE(Y)
	movaps	%xmm3, 6 * SIZE(Y)

	addpd	%xmm4, %xmm8
	addpd	%xmm5, %xmm9
	addpd	%xmm6, %xmm10
	addpd	%xmm14, %xmm11

	movaps	%xmm8,  8 * SIZE(Y)
	movaps	%xmm9, 10 * SIZE(Y)
	movaps	%xmm10, 12 * SIZE(Y)
	movaps	%xmm11, 14 * SIZE(Y)

	movaps	%xmm12, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L23:
/* Remaining 8 elements */
	movq	M,  %rax
	andq	$8, %rax
	jle	.L24
	ALIGN_3

	movaps	1 * SIZE(X), %xmm1
	movaps	3 * SIZE(X), %xmm2
	movaps	5 * SIZE(X), %xmm3
	movaps	7 * SIZE(X), %xmm8

	movaps	0 * SIZE(Y), %xmm4
	movaps	2 * SIZE(Y), %xmm5
	movaps	4 * SIZE(Y), %xmm6
	movaps	6 * SIZE(Y), %xmm14

#ifdef PENRYN
	shufps	$0x4e, %xmm1, %xmm0
	shufps	$0x4e, %xmm2, %xmm1
	shufps	$0x4e, %xmm3, %xmm2
	shufps	$0x4e, %xmm8, %xmm3
#else
	SHUFPD_1 %xmm1, %xmm0
	SHUFPD_1 %xmm2, %xmm1
	SHUFPD_1 %xmm3, %xmm2
	SHUFPD_1 %xmm8, %xmm3
#endif

	mulpd	ALPHA, %xmm0
	mulpd	ALPHA, %xmm1
	mulpd	ALPHA, %xmm2
	mulpd	ALPHA, %xmm3

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm1
	addpd	%xmm6, %xmm2
	addpd	%xmm14, %xmm3

	movaps	%xmm0, 0 * SIZE(Y)
	movaps	%xmm1, 2 * SIZE(Y)
	movaps	%xmm2, 4 * SIZE(Y)
	movaps	%xmm3, 6 * SIZE(Y)

	movaps	%xmm8, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L24:
/* Remaining 4 elements */
	movq	M,  %rax
	andq	$4, %rax
	jle	.L25
	ALIGN_3

	movaps	1 * SIZE(X), %xmm1
	movaps	3 * SIZE(X), %xmm2

	movaps	0 * SIZE(Y), %xmm4
	movaps	2 * SIZE(Y), %xmm5

#ifdef PENRYN
	shufps	$0x4e, %xmm1, %xmm0
	shufps	$0x4e, %xmm2, %xmm1
#else
	SHUFPD_1 %xmm1, %xmm0
	SHUFPD_1 %xmm2, %xmm1
#endif

	mulpd	ALPHA, %xmm0
	mulpd	ALPHA, %xmm1

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm1

	movaps	%xmm0, 0 * SIZE(Y)
	movaps	%xmm1, 2 * SIZE(Y)

	movaps	%xmm2, %xmm0

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L25:
/* Remaining 2 elements */
	movq	M,  %rax
	andq	$2, %rax
	jle	.L26
	ALIGN_3

	movaps	1 * SIZE(X), %xmm1
	movaps	0 * SIZE(Y), %xmm4

#ifdef PENRYN
	shufps	$0x4e, %xmm1, %xmm0
#else
	SHUFPD_1 %xmm1, %xmm0
#endif

	mulpd	ALPHA, %xmm0
	addpd	%xmm4, %xmm0

	movaps	%xmm0, 0 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L26:
/* Last single element */
	movq	M,  %rax
	andq	$1, %rax
	jle	.L29
	ALIGN_3

	movsd	0 * SIZE(X), %xmm0
	mulsd	ALPHA, %xmm0
	addsd	0 * SIZE(Y), %xmm0
	movsd	%xmm0, 0 * SIZE(Y)

	addq	$SIZE, Y
	ALIGN_3

.L29:
	xorq	%rax,%rax
	RESTOREREGISTERS
	ret
	ALIGN_3

.L40:
/* General strided case (non-unit INCX / INCY): 8 elements per iteration */
	movq	Y, YY

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L45
	ALIGN_3

.L41:
	movsd	0 * SIZE(X), %xmm0
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm0
	addq	INCX, X
	mulpd	ALPHA, %xmm0

	movsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	addpd	%xmm6, %xmm0

	movsd	0 * SIZE(X), %xmm1
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm1
	addq	INCX, X
	mulpd	ALPHA, %xmm1

	movsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	addpd	%xmm6, %xmm1

	movsd	0 * SIZE(X), %xmm2
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm2
	addq	INCX, X
	mulpd	ALPHA, %xmm2

	movsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	addpd	%xmm6, %xmm2

	movsd	0 * SIZE(X), %xmm3
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm3
	addq	INCX, X
	mulpd	ALPHA, %xmm3

	movsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	addpd	%xmm6, %xmm3

	movsd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y

	decq	%rax
	jg	.L41
	ALIGN_3

.L45:
/* Strided remainder: scalar loop */
	movq	M,  %rax
	andq	$7, %rax
	jle	.L47
	ALIGN_3

.L46:
	movsd	(X), %xmm0
	addq	INCX, X

	mulsd	ALPHA, %xmm0

	addsd	(Y), %xmm0
	movsd	%xmm0, (Y)
	addq	INCY, Y

	decq	%rax
	jg	.L46
	ALIGN_3

.L47:
	xorq	%rax, %rax
	RESTOREREGISTERS
	ret

	EPILOGUE
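
A note on the realignment trick used from .L20 onward: Y is accessed with 16-byte-aligned movaps while X sits one double (8 bytes) off that alignment, so instead of unaligned loads the kernel reads aligned pairs that straddle the wanted elements and splices their halves together with SHUFPD_1 (or shufps $0x4e on PENRYN builds). A minimal C intrinsics sketch of that splice for a single pair, assuming x is 8-byte but not 16-byte aligned and y is 16-byte aligned (the standalone function and its name are illustrative, not part of the source):

#include <emmintrin.h>

/* Splice-and-update for one pair of doubles, mirroring the SHUFPD_1 /
   shufps $0x4e pattern in the .L20 path. Both 16-byte loads from x are
   aligned because x itself is 8 bytes past a 16-byte boundary. */
static void axpy_shifted_pair(double alpha, const double *x, double *y)
{
    __m128d prev = _mm_load_pd(x - 1);            /* { x[-1], x[0] } */
    __m128d next = _mm_load_pd(x + 1);            /* { x[1],  x[2] } */
    __m128d xv   = _mm_shuffle_pd(prev, next, 1); /* { x[0],  x[1] } */
    __m128d yv   = _mm_load_pd(y);                /* y is 16-byte aligned */

    yv = _mm_add_pd(yv, _mm_mul_pd(_mm_set1_pd(alpha), xv));
    _mm_store_pd(y, yv);
}

Carrying the high half of each X load into the next splice (the movaps %xmm12, %xmm0 style copies in the loop) is what keeps every access in the .L21 loop aligned while X is effectively streamed at an 8-byte offset.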
