⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemv_n_sse2_core2.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 4 页
字号:
	movapd	 -14 * SIZE(A1), %xmm9	movapd	 -16 * SIZE(A2), %xmm10	movapd	 -14 * SIZE(A2), %xmm11	mulpd	 %xmm12, %xmm8	mulpd	 %xmm12, %xmm9	mulpd	 %xmm13, %xmm10	mulpd	 %xmm13, %xmm11	addpd	 %xmm8,  %xmm0	addpd	 %xmm9,  %xmm1	addpd	 %xmm10, %xmm0	addpd	 %xmm11, %xmm1	movapd	 -16 * SIZE(A1, LDA, 2), %xmm8	movapd	 -14 * SIZE(A1, LDA, 2), %xmm9	movapd	 -16 * SIZE(A2, LDA, 2), %xmm10	movapd	 -14 * SIZE(A2, LDA, 2), %xmm11	mulpd	 %xmm14, %xmm8	mulpd	 %xmm14, %xmm9	mulpd	 %xmm15, %xmm10	mulpd	 %xmm15, %xmm11	addpd	 %xmm8, %xmm0	addpd	 %xmm9, %xmm1	addpd	 %xmm10, %xmm0	addpd	 %xmm11, %xmm1	movapd	 %xmm0, -16 * SIZE(Y1)	movapd	 %xmm1, -14 * SIZE(Y1)	addq	 $4 * SIZE, A1	addq	 $4 * SIZE, A2	addq	 $4 * SIZE, Y1	ALIGN_3.L17:	testq	$2, MM	je	.L18	movapd	 -16 * SIZE(Y1), %xmm0	movapd	 -16 * SIZE(A1), %xmm8	movapd	 -16 * SIZE(A2), %xmm10	mulpd	 %xmm12, %xmm8	mulpd	 %xmm13, %xmm10	addpd	 %xmm8,  %xmm0	addpd	 %xmm10, %xmm0	movapd	 -16 * SIZE(A1, LDA, 2), %xmm8	movapd	 -16 * SIZE(A2, LDA, 2), %xmm10	mulpd	 %xmm14, %xmm8	mulpd	 %xmm15, %xmm10	addpd	 %xmm8, %xmm0	addpd	 %xmm10, %xmm0	movapd	 %xmm0, -16 * SIZE(Y1)	addq	 $2 * SIZE, A1	addq	 $2 * SIZE, A2	addq	 $2 * SIZE, Y1	ALIGN_3.L18:	testq	$1, MM	je	.L19	movsd	 -16 * SIZE(Y1), %xmm0	movsd	 -16 * SIZE(A1), %xmm8	movsd	 -16 * SIZE(A2), %xmm9	movsd	 -16 * SIZE(A1, LDA, 2), %xmm10	movsd	 -16 * SIZE(A2, LDA, 2), %xmm11	mulsd	 %xmm12, %xmm8	mulsd	 %xmm13, %xmm9	mulsd	 %xmm14, %xmm10	mulsd	 %xmm15, %xmm11	addsd	 %xmm8,  %xmm0	addsd	 %xmm9,  %xmm0	addsd	 %xmm10, %xmm0	addsd	 %xmm11, %xmm0	movsd	 %xmm0, -16 * SIZE(Y1)	addq	 $1 * SIZE, A1	addq	 $1 * SIZE, A2	addq	 $1 * SIZE, Y1	ALIGN_3.L19:	decq	J	jg	.L11	ALIGN_3.L20:	testq	$2, N	je	.L30	leaq	16 * SIZE(BUFFER), Y1	movq	A,  A1	leaq	(A, LDA, 1), A2	leaq	(A, LDA, 2), A	movsd	(X), %xmm12	addq	INCX, X	movsd	(X), %xmm13	addq	INCX, X	mulsd	STACK_ALPHA, %xmm12	mulsd	STACK_ALPHA, %xmm13	unpcklpd %xmm12, %xmm12	unpcklpd %xmm13, %xmm13	testq	$SIZE, A	je	.L22	movsd	 -16 * SIZE(Y1), %xmm0	movsd	 -16 * SIZE(A1), %xmm8	movsd	 -16 * SIZE(A2), %xmm9	mulsd	 %xmm12, %xmm8	mulsd	 %xmm13, %xmm9	addsd	 %xmm8,  %xmm0	addsd	 %xmm9,  %xmm0	movsd	 %xmm0, -16 * SIZE(Y1)	addq	 $1 * SIZE, A1	addq	 $1 * SIZE, A2	addq	 $1 * SIZE, Y1	ALIGN_3.L22:	movq	MM,  I	sarq	$4, I	jle	.L25	ALIGN_3.L23:	movapd	 -16 * SIZE(Y1), %xmm0	movapd	 -14 * SIZE(Y1), %xmm1	movapd	 -12 * SIZE(Y1), %xmm2	movapd	 -10 * SIZE(Y1), %xmm3	movapd	 -16 * SIZE(A1), %xmm8	movapd	 -14 * SIZE(A1), %xmm9	movapd	 -12 * SIZE(A1), %xmm10	movapd	 -10 * SIZE(A1), %xmm11	mulpd	 %xmm12, %xmm8	mulpd	 %xmm12, %xmm9	mulpd	 %xmm12, %xmm10	mulpd	 %xmm12, %xmm11	addpd	 %xmm8,  %xmm0	addpd	 %xmm9,  %xmm1	addpd	 %xmm10, %xmm2	addpd	 %xmm11, %xmm3	movapd	 -16 * SIZE(A2), %xmm8	movapd	 -14 * SIZE(A2), %xmm9	movapd	 -12 * SIZE(A2), %xmm10	movapd	 -10 * SIZE(A2), %xmm11	mulpd	 %xmm13, %xmm8	mulpd	 %xmm13, %xmm9	mulpd	 %xmm13, %xmm10	mulpd	 %xmm13, %xmm11	addpd	 %xmm8, %xmm0	addpd	 %xmm9, %xmm1	addpd	 %xmm10, %xmm2	addpd	 %xmm11, %xmm3	movapd	 %xmm0, -16 * SIZE(Y1)	movapd	 %xmm1, -14 * SIZE(Y1)	movapd	 %xmm2, -12 * SIZE(Y1)	movapd	 %xmm3, -10 * SIZE(Y1)	movapd	-8 * SIZE(Y1), %xmm0	movapd	-6 * SIZE(Y1), %xmm1	movapd	-4 * SIZE(Y1), %xmm2	movapd	-2 * SIZE(Y1), %xmm3	movapd	 -8 * SIZE(A1), %xmm8	movapd	 -6 * SIZE(A1), %xmm9	movapd	 -4 * SIZE(A1), %xmm10	movapd	 -2 * SIZE(A1), %xmm11	mulpd	 %xmm12, %xmm8	mulpd	 %xmm12, %xmm9	mulpd	 %xmm12, %xmm10	mulpd	 %xmm12, %xmm11	addpd	 %xmm8,  %xmm0	addpd	 %xmm9,  %xmm1	addpd	 %xmm10, %xmm2	addpd	 %xmm11, %xmm3	movapd	 -8 * SIZE(A2), %xmm8	movapd	 -6 * SIZE(A2), %xmm9	movapd	 -4 * SIZE(A2), %xmm10	movapd	 -2 * SIZE(A2), %xmm11	mulpd	 %xmm13, %xmm8	mulpd	 %xmm13, %xmm9	mulpd	 %xmm13, %xmm10	mulpd	 %xmm13, %xmm11	addpd	 %xmm8, %xmm0	addpd	 %xmm9, %xmm1	addpd	 %xmm10, %xmm2	addpd	 %xmm11, %xmm3	movapd	 %xmm0, -8 * SIZE(Y1)	movapd	 %xmm1, -6 * SIZE(Y1)	movapd	 %xmm2, -4 * SIZE(Y1)	movapd	 %xmm3, -2 * SIZE(Y1)	subq	 $-16 * SIZE, A1	subq	 $-16 * SIZE, A2	subq	 $-16 * SIZE, Y1	subq	 $1, I	jg	.L23	ALIGN_3.L25:	testq	$8, MM	je	.L26	movapd	 -16 * SIZE(Y1), %xmm0	movapd	 -14 * SIZE(Y1), %xmm1	movapd	 -12 * SIZE(Y1), %xmm2	movapd	 -10 * SIZE(Y1), %xmm3	movapd	 -16 * SIZE(A1), %xmm8	movapd	 -14 * SIZE(A1), %xmm9	movapd	 -12 * SIZE(A1), %xmm10	movapd	 -10 * SIZE(A1), %xmm11	mulpd	 %xmm12, %xmm8	mulpd	 %xmm12, %xmm9	mulpd	 %xmm12, %xmm10	mulpd	 %xmm12, %xmm11	addpd	 %xmm8,  %xmm0	addpd	 %xmm9,  %xmm1	addpd	 %xmm10, %xmm2	addpd	 %xmm11, %xmm3	movapd	 -16 * SIZE(A2), %xmm8	movapd	 -14 * SIZE(A2), %xmm9	movapd	 -12 * SIZE(A2), %xmm10	movapd	 -10 * SIZE(A2), %xmm11	mulpd	 %xmm13, %xmm8	mulpd	 %xmm13, %xmm9	mulpd	 %xmm13, %xmm10	mulpd	 %xmm13, %xmm11	addpd	 %xmm8, %xmm0	addpd	 %xmm9, %xmm1	addpd	 %xmm10, %xmm2	addpd	 %xmm11, %xmm3	movapd	 %xmm0, -16 * SIZE(Y1)	movapd	 %xmm1, -14 * SIZE(Y1)	movapd	 %xmm2, -12 * SIZE(Y1)	movapd	 %xmm3, -10 * SIZE(Y1)	addq	 $8 * SIZE, A1	addq	 $8 * SIZE, A2	addq	 $8 * SIZE, Y1	ALIGN_3.L26:	testq	$4, MM	je	.L27	movapd	 -16 * SIZE(Y1), %xmm0	movapd	 -14 * SIZE(Y1), %xmm1	movapd	 -16 * SIZE(A1), %xmm8	movapd	 -14 * SIZE(A1), %xmm9	movapd	 -16 * SIZE(A2), %xmm10	movapd	 -14 * SIZE(A2), %xmm11	mulpd	 %xmm12, %xmm8	mulpd	 %xmm12, %xmm9	mulpd	 %xmm13, %xmm10	mulpd	 %xmm13, %xmm11	addpd	 %xmm8,  %xmm0	addpd	 %xmm9,  %xmm1	addpd	 %xmm10, %xmm0	addpd	 %xmm11, %xmm1	movapd	 %xmm0, -16 * SIZE(Y1)	movapd	 %xmm1, -14 * SIZE(Y1)	addq	 $4 * SIZE, A1	addq	 $4 * SIZE, A2	addq	 $4 * SIZE, Y1	ALIGN_3.L27:	testq	$2, MM	je	.L28	movapd	 -16 * SIZE(Y1), %xmm0	movapd	 -16 * SIZE(A1), %xmm8	movapd	 -16 * SIZE(A2), %xmm10	mulpd	 %xmm12, %xmm8	mulpd	 %xmm13, %xmm10	addpd	 %xmm8,  %xmm0	addpd	 %xmm10, %xmm0	movapd	 %xmm0, -16 * SIZE(Y1)	addq	 $2 * SIZE, A1	addq	 $2 * SIZE, A2	addq	 $2 * SIZE, Y1	ALIGN_3.L28:	testq	$1, MM	je	.L30	movsd	-16 * SIZE(Y1), %xmm0	movsd	-16 * SIZE(A1), %xmm8	movsd	-16 * SIZE(A2), %xmm9	mulsd	 %xmm12, %xmm8	mulsd	 %xmm13, %xmm9	addsd	 %xmm8,  %xmm0	addsd	 %xmm9,  %xmm0	movsd	 %xmm0, -16 * SIZE(Y1)	addq	 $1 * SIZE, A1	addq	 $1 * SIZE, A2	addq	 $1 * SIZE, Y1	ALIGN_3.L30:	testq	$1, N	je	.L990	leaq	16 * SIZE(BUFFER), Y1	movq	A,  A1	movsd	(X), %xmm12	mulsd	STACK_ALPHA, %xmm12	unpcklpd %xmm12, %xmm12	testq	$SIZE, A	je	.L32	movsd	 -16 * SIZE(Y1), %xmm0	movsd	 -16 * SIZE(A1), %xmm8	mulsd	 %xmm12, %xmm8	addsd	 %xmm8,  %xmm0	movsd	 %xmm0, -16 * SIZE(Y1)	addq	 $1 * SIZE, A1	addq	 $1 * SIZE, Y1	ALIGN_3.L32:	movq	MM,  I	sarq	$4, I	jle	.L35	ALIGN_3.L33:	movapd	 -16 * SIZE(Y1), %xmm0	movapd	 -14 * SIZE(Y1), %xmm1	movapd	 -12 * SIZE(Y1), %xmm2	movapd	 -10 * SIZE(Y1), %xmm3	movapd	 -16 * SIZE(A1), %xmm8	movapd	 -14 * SIZE(A1), %xmm9	movapd	 -12 * SIZE(A1), %xmm10	movapd	 -10 * SIZE(A1), %xmm11	mulpd	 %xmm12, %xmm8	mulpd	 %xmm12, %xmm9	mulpd	 %xmm12, %xmm10	mulpd	 %xmm12, %xmm11	addpd	 %xmm8,  %xmm0	addpd	 %xmm9,  %xmm1	addpd	 %xmm10, %xmm2	addpd	 %xmm11, %xmm3	movapd	 %xmm0, -16 * SIZE(Y1)	movapd	 %xmm1, -14 * SIZE(Y1)	movapd	 %xmm2, -12 * SIZE(Y1)	movapd	 %xmm3, -10 * SIZE(Y1)	movapd	-8 * SIZE(Y1), %xmm0	movapd	-6 * SIZE(Y1), %xmm1	movapd	-4 * SIZE(Y1), %xmm2	movapd	-2 * SIZE(Y1), %xmm3	movapd	 -8 * SIZE(A1), %xmm8	movapd	 -6 * SIZE(A1), %xmm9	movapd	 -4 * SIZE(A1), %xmm10	movapd	 -2 * SIZE(A1), %xmm11	mulpd	 %xmm12, %xmm8	mulpd	 %xmm12, %xmm9	mulpd	 %xmm12, %xmm10	mulpd	 %xmm12, %xmm11	addpd	 %xmm8,  %xmm0	addpd	 %xmm9,  %xmm1	addpd	 %xmm10, %xmm2	addpd	 %xmm11, %xmm3	movapd	 %xmm0, -8 * SIZE(Y1)	movapd	 %xmm1, -6 * SIZE(Y1)	movapd	 %xmm2, -4 * SIZE(Y1)	movapd	 %xmm3, -2 * SIZE(Y1)	subq	 $-16 * SIZE, A1	subq	 $-16 * SIZE, Y1	decq	 I	jg	.L33	ALIGN_3.L35:	testq	$8, MM	je	.L36	movapd	-16 * SIZE(Y1), %xmm0	movapd	-14 * SIZE(Y1), %xmm1	movapd	-12 * SIZE(Y1), %xmm2	movapd	-10 * SIZE(Y1), %xmm3	movapd	 -16 * SIZE(A1), %xmm8	movapd	 -14 * SIZE(A1), %xmm9	movapd	 -12 * SIZE(A1), %xmm10	movapd	 -10 * SIZE(A1), %xmm11	mulpd	 %xmm12, %xmm8	mulpd	 %xmm12, %xmm9	mulpd	 %xmm12, %xmm10	mulpd	 %xmm12, %xmm11	addpd	 %xmm8,  %xmm0	addpd	 %xmm9,  %xmm1	addpd	 %xmm10, %xmm2	addpd	 %xmm11, %xmm3	movapd	 %xmm0, -16 * SIZE(Y1)	movapd	 %xmm1, -14 * SIZE(Y1)	movapd	 %xmm2, -12 * SIZE(Y1)	movapd	 %xmm3, -10 * SIZE(Y1)	addq	 $8 * SIZE, A1	addq	 $8 * SIZE, Y1	ALIGN_3.L36:	testq	$4, MM	je	.L37	movapd	-16 * SIZE(Y1), %xmm0	movapd	-14 * SIZE(Y1), %xmm1	movapd	-16 * SIZE(A1), %xmm8	movapd	-14 * SIZE(A1), %xmm9	mulpd	 %xmm12, %xmm8	mulpd	 %xmm12, %xmm9	addpd	 %xmm8,  %xmm0	addpd	 %xmm9,  %xmm1	movapd	 %xmm0, -16 * SIZE(Y1)	movapd	 %xmm1, -14 * SIZE(Y1)	addq	 $4 * SIZE, A1	addq	 $4 * SIZE, Y1	ALIGN_3.L37:	testq	$2, MM	je	.L38	movapd	 -16 * SIZE(Y1), %xmm0	movapd	 -16 * SIZE(A1), %xmm8	mulpd	 %xmm12, %xmm8	addpd	 %xmm8,  %xmm0	movapd	 %xmm0, -16 * SIZE(Y1)	addq	 $2 * SIZE, A1	addq	 $2 * SIZE, Y1	ALIGN_3.L38:	testq	$1, MM	je	.L990	movsd	 -16 * SIZE(Y1), %xmm0	movsd	 -16 * SIZE(A1), %xmm8	mulsd	 %xmm12, %xmm8	addsd	 %xmm8,  %xmm0	movsd	 %xmm0, -16 * SIZE(Y1)	jmp	 .L990	ALIGN_3.L40:	movq	N,  J	sarq	$2, J	jle	.L50	ALIGN_3.L41:	movq	BUFFER, Y1	movq	A,  A1	leaq	(A, LDA, 1), A2	leaq	(A, LDA, 4), A	movsd	(X), %xmm0	addq	INCX, X	movsd	(X), %xmm1	addq	INCX, X	movsd	(X), %xmm2	addq	INCX, X	movsd	(X), %xmm3	addq	INCX, X	mulsd	STACK_ALPHA, %xmm0	mulsd	STACK_ALPHA, %xmm1	mulsd	STACK_ALPHA, %xmm2	mulsd	STACK_ALPHA, %xmm3	unpcklpd %xmm0, %xmm0	unpcklpd %xmm1, %xmm1	unpcklpd %xmm2, %xmm2	unpcklpd %xmm3, %xmm3	ALIGN_3	testq	$SIZE, A	je	.L42	movsd	 0 * SIZE(Y1), %xmm12	movsd	-16 * SIZE(A1), %xmm8	movsd	-16 * SIZE(A2), %xmm9	movsd	-16 * SIZE(A1, LDA, 2), %xmm10	movsd	-16 * SIZE(A2, LDA, 2), %xmm11	mulsd	 %xmm0, %xmm8	mulsd	 %xmm1, %xmm9	mulsd	 %xmm2, %xmm10	mulsd	 %xmm3, %xmm11	addsd	 %xmm8,  %xmm12	addsd	 %xmm9,  %xmm12	addsd	 %xmm10, %xmm12	addsd	 %xmm11, %xmm12	movsd	 %xmm12, 0 * SIZE(Y1)	addq	 $1 * SIZE, A1	addq	 $1 * SIZE, A2	addq	 $1 * SIZE, Y1	ALIGN_3.L42:	movq	MM,  I	sarq	$4, I	jle	.L45	movapd	-16 * SIZE(A1), %xmm8	movapd	-14 * SIZE(A1), %xmm9	movapd	-12 * SIZE(A1), %xmm10	movapd	-10 * SIZE(A1), %xmm11	movsd	-16 * SIZE(A2), %xmm12	movhpd	-15 * SIZE(A2), %xmm12	movsd	-14 * SIZE(A2), %xmm13	movhpd	-13 * SIZE(A2), %xmm13	movsd	-12 * SIZE(A2), %xmm14	movhpd	-11 * SIZE(A2), %xmm14	movsd	-10 * SIZE(A2), %xmm15	movhpd	 -9 * SIZE(A2), %xmm15	movapd	 0 * SIZE(Y1), %xmm4	movapd	 2 * SIZE(Y1), %xmm5	movapd	 4 * SIZE(Y1), %xmm6	movapd	 6 * SIZE(Y1), %xmm7	mulpd	 %xmm0, %xmm8	mulpd	 %xmm0, %xmm9	mulpd	 %xmm0, %xmm10	mulpd	 %xmm0, %xmm11	decq	 I	jle	 .L44

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -