⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemv_n_sse2_barcelona.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
	addq	 $1 * SIZE, A1	addq	 $1 * SIZE, A2	addq	 $1 * SIZE, Y1	ALIGN_3.L19:	decq	J	jg	.L11	ALIGN_3.L20:	testq	$2, N	je	.L30	movq	YY, Y1	movq	A,  A1	leaq	(A, LDA, 1), A2	leaq	(A, LDA, 2), A	movlpd	(X), %xmm0	addq	INCX, X	movlpd	(X), %xmm1	addq	INCX, X	mulsd	STACK_ALPHA, %xmm0	mulsd	STACK_ALPHA, %xmm1	unpcklpd %xmm0, %xmm0	unpcklpd %xmm1, %xmm1	movq	M,  I	sarq	$4, I	jle	.L25	movupd	 0 * SIZE(A1), %xmm8	movupd	 2 * SIZE(A1), %xmm9	movupd	 4 * SIZE(A1), %xmm10	movupd	 6 * SIZE(A1), %xmm11	movupd	 0 * SIZE(A2), %xmm12	movupd	 2 * SIZE(A2), %xmm13	movupd	 4 * SIZE(A2), %xmm14	movupd	 6 * SIZE(A2), %xmm15	movupd	 0 * SIZE(Y1), %xmm4	movupd	 2 * SIZE(Y1), %xmm5	movupd	 4 * SIZE(Y1), %xmm6	movupd	 6 * SIZE(Y1), %xmm7	mulpd	 %xmm0, %xmm8	mulpd	 %xmm0, %xmm9	mulpd	 %xmm0, %xmm10	mulpd	 %xmm0, %xmm11	decq	 I	jle	 .L22	ALIGN_3.L21:	PREFETCH	PREFETCHSIZE * SIZE(A1)	addpd	 %xmm8,  %xmm4	movupd	 8 * SIZE(A1), %xmm8	mulpd	 %xmm1, %xmm12	addpd	 %xmm9,  %xmm5	movupd	10 * SIZE(A1), %xmm9	mulpd	 %xmm1, %xmm13	addpd	 %xmm10, %xmm6	movupd	12 * SIZE(A1), %xmm10	mulpd	 %xmm1, %xmm14	addpd	 %xmm11, %xmm7	movupd	14 * SIZE(A1), %xmm11	mulpd	 %xmm1, %xmm15	PREFETCH	PREFETCHSIZE * SIZE(Y1)	addpd	 %xmm12, %xmm4	movupd	 8 * SIZE(A2), %xmm12	mulpd	 %xmm0, %xmm8	addpd	 %xmm13, %xmm5	movupd	10 * SIZE(A2), %xmm13	mulpd	 %xmm0, %xmm9	addpd	 %xmm14, %xmm6	movupd	12 * SIZE(A2), %xmm14	mulpd	 %xmm0, %xmm10	addpd	 %xmm15, %xmm7	movupd	14 * SIZE(A2), %xmm15	mulpd	 %xmm0, %xmm11	movlpd	 %xmm4, 0 * SIZE(Y1)	movhpd	 %xmm4, 1 * SIZE(Y1)	movupd	 8 * SIZE(Y1), %xmm4	movlpd	 %xmm5, 2 * SIZE(Y1)	movhpd	 %xmm5, 3 * SIZE(Y1)	movupd	10 * SIZE(Y1), %xmm5	movlpd	 %xmm6, 4 * SIZE(Y1)	movhpd	 %xmm6, 5 * SIZE(Y1)	movupd	12 * SIZE(Y1), %xmm6	movlpd	 %xmm7, 6 * SIZE(Y1)	movhpd	 %xmm7, 7 * SIZE(Y1)	movupd	14 * SIZE(Y1), %xmm7	PREFETCH	PREFETCHSIZE * SIZE(A2)	addpd	 %xmm8,  %xmm4	movupd	16 * SIZE(A1), %xmm8	mulpd	 %xmm1, %xmm12	addpd	 %xmm9,  %xmm5	movupd	18 * SIZE(A1), %xmm9	mulpd	 %xmm1, %xmm13	addpd	 %xmm10, %xmm6	movupd	20 * SIZE(A1), %xmm10	mulpd	 %xmm1, %xmm14	addpd	 %xmm11, %xmm7	movupd	22 * SIZE(A1), %xmm11	mulpd	 %xmm1, %xmm15	addpd	 %xmm12, %xmm4	movupd	 16 * SIZE(A2), %xmm12	mulpd	 %xmm0, %xmm8	addpd	 %xmm13, %xmm5	movupd	 18 * SIZE(A2), %xmm13	mulpd	 %xmm0, %xmm9	addpd	 %xmm14, %xmm6	movupd	 20 * SIZE(A2), %xmm14	mulpd	 %xmm0, %xmm10	addpd	 %xmm15, %xmm7	movupd	 22 * SIZE(A2), %xmm15	mulpd	 %xmm0, %xmm11	movlpd	 %xmm4,  8 * SIZE(Y1)	movhpd	 %xmm4,  9 * SIZE(Y1)	movupd	16 * SIZE(Y1), %xmm4	movlpd	 %xmm5, 10 * SIZE(Y1)	movhpd	 %xmm5, 11 * SIZE(Y1)	movupd	18 * SIZE(Y1), %xmm5	movlpd	 %xmm6, 12 * SIZE(Y1)	movhpd	 %xmm6, 13 * SIZE(Y1)	movupd	20 * SIZE(Y1), %xmm6	movlpd	 %xmm7, 14 * SIZE(Y1)	movhpd	 %xmm7, 15 * SIZE(Y1)	movupd	22 * SIZE(Y1), %xmm7	subq	$-16 * SIZE, A1	subq	$-16 * SIZE, A2	subq	$-16 * SIZE, Y1	decq	 I	jg	.L21	ALIGN_3.L22:	addpd	 %xmm8,  %xmm4	movupd	 8 * SIZE(A1), %xmm8	mulpd	 %xmm1, %xmm12	addpd	 %xmm9,  %xmm5	movupd	10 * SIZE(A1), %xmm9	mulpd	 %xmm1, %xmm13	addpd	 %xmm10, %xmm6	movupd	12 * SIZE(A1), %xmm10	mulpd	 %xmm1, %xmm14	addpd	 %xmm11, %xmm7	movupd	14 * SIZE(A1), %xmm11	mulpd	 %xmm1, %xmm15	addpd	 %xmm12, %xmm4	movupd	 8 * SIZE(A2), %xmm12	mulpd	 %xmm0, %xmm8	addpd	 %xmm13, %xmm5	movupd	10 * SIZE(A2), %xmm13	mulpd	 %xmm0, %xmm9	addpd	 %xmm14, %xmm6	movupd	12 * SIZE(A2), %xmm14	mulpd	 %xmm0, %xmm10	addpd	 %xmm15, %xmm7	movupd	14 * SIZE(A2), %xmm15	mulpd	 %xmm0, %xmm11	movlpd	 %xmm4, 0 * SIZE(Y1)	movhpd	 %xmm4, 1 * SIZE(Y1)	movupd	 8 * SIZE(Y1), %xmm4	movlpd	 %xmm5, 2 * SIZE(Y1)	movhpd	 %xmm5, 3 * SIZE(Y1)	movupd	10 * SIZE(Y1), %xmm5	movlpd	 %xmm6, 4 * SIZE(Y1)	movhpd	 %xmm6, 5 * SIZE(Y1)	movupd	12 * SIZE(Y1), %xmm6	movlpd	 %xmm7, 6 * SIZE(Y1)	movhpd	 %xmm7, 7 * SIZE(Y1)	movupd	14 * SIZE(Y1), %xmm7	addpd	 %xmm8,  %xmm4	mulpd	 %xmm1, %xmm12	addpd	 %xmm9,  %xmm5	mulpd	 %xmm1, %xmm13	addpd	 %xmm10, %xmm6	mulpd	 %xmm1, %xmm14	addpd	 %xmm11, %xmm7	mulpd	 %xmm1, %xmm15	addpd	 %xmm12, %xmm4	addpd	 %xmm13, %xmm5	addpd	 %xmm14, %xmm6	addpd	 %xmm15, %xmm7	movlpd	 %xmm4,  8 * SIZE(Y1)	movhpd	 %xmm4,  9 * SIZE(Y1)	movlpd	 %xmm5, 10 * SIZE(Y1)	movhpd	 %xmm5, 11 * SIZE(Y1)	movlpd	 %xmm6, 12 * SIZE(Y1)	movhpd	 %xmm6, 13 * SIZE(Y1)	movlpd	 %xmm7, 14 * SIZE(Y1)	movhpd	 %xmm7, 15 * SIZE(Y1)	subq	$-16 * SIZE, A1	subq	$-16 * SIZE, A2	subq	$-16 * SIZE, Y1	ALIGN_3.L25:	testq	$8, M	je	.L26	movupd	 0 * SIZE(A1), %xmm8	movupd	 2 * SIZE(A1), %xmm9	movupd	 4 * SIZE(A1), %xmm10	movupd	 6 * SIZE(A1), %xmm11	movupd	 0 * SIZE(A2), %xmm12	movupd	 2 * SIZE(A2), %xmm13	movupd	 4 * SIZE(A2), %xmm14	movupd	 6 * SIZE(A2), %xmm15	movupd	 0 * SIZE(Y1), %xmm4	movupd	 2 * SIZE(Y1), %xmm5	movupd	 4 * SIZE(Y1), %xmm6	movupd	 6 * SIZE(Y1), %xmm7	mulpd	 %xmm0, %xmm8	mulpd	 %xmm0, %xmm9	mulpd	 %xmm0, %xmm10	mulpd	 %xmm0, %xmm11	addpd	 %xmm8,  %xmm4	mulpd	 %xmm1, %xmm12	addpd	 %xmm9,  %xmm5	mulpd	 %xmm1, %xmm13	addpd	 %xmm10, %xmm6	mulpd	 %xmm1, %xmm14	addpd	 %xmm11, %xmm7	mulpd	 %xmm1, %xmm15	addpd	 %xmm12, %xmm4	addpd	 %xmm13, %xmm5	addpd	 %xmm14, %xmm6	addpd	 %xmm15, %xmm7	movlpd	 %xmm4,  0 * SIZE(Y1)	movhpd	 %xmm4,  1 * SIZE(Y1)	movlpd	 %xmm5,  2 * SIZE(Y1)	movhpd	 %xmm5,  3 * SIZE(Y1)	movlpd	 %xmm6,  4 * SIZE(Y1)	movhpd	 %xmm6,  5 * SIZE(Y1)	movlpd	 %xmm7,  6 * SIZE(Y1)	movhpd	 %xmm7,  7 * SIZE(Y1)	addq	 $8 * SIZE, A1	addq	 $8 * SIZE, A2	addq	 $8 * SIZE, Y1	ALIGN_3.L26:	testq	$4, M	je	.L27	movupd	 0 * SIZE(A1), %xmm8	movupd	 2 * SIZE(A1), %xmm9	movupd	 0 * SIZE(A2), %xmm10	movupd	 2 * SIZE(A2), %xmm11	movupd	 0 * SIZE(Y1), %xmm4	movupd	 2 * SIZE(Y1), %xmm5	mulpd	 %xmm0, %xmm8	mulpd	 %xmm0, %xmm9	mulpd	 %xmm1, %xmm10	mulpd	 %xmm1, %xmm11	addpd	 %xmm8,  %xmm4	addpd	 %xmm9,  %xmm5	addpd	 %xmm10, %xmm4	addpd	 %xmm11, %xmm5	movlpd	 %xmm4,  0 * SIZE(Y1)	movhpd	 %xmm4,  1 * SIZE(Y1)	movlpd	 %xmm5,  2 * SIZE(Y1)	movhpd	 %xmm5,  3 * SIZE(Y1)	addq	 $4 * SIZE, A1	addq	 $4 * SIZE, A2	addq	 $4 * SIZE, Y1	ALIGN_3.L27:	testq	$2, M	je	.L28	movupd	 0 * SIZE(A1), %xmm8	movupd	 0 * SIZE(A2), %xmm10	movupd	 0 * SIZE(Y1), %xmm4	mulpd	 %xmm0, %xmm8	mulpd	 %xmm1, %xmm10	addpd	 %xmm8,  %xmm4	addpd	 %xmm10, %xmm4	movlpd	 %xmm4,  0 * SIZE(Y1)	movhpd	 %xmm4,  1 * SIZE(Y1)	addq	 $2 * SIZE, A1	addq	 $2 * SIZE, A2	addq	 $2 * SIZE, Y1	ALIGN_3.L28:	testq	$1, M	je	.L30	movlpd	 0 * SIZE(A1), %xmm8	movlpd	 0 * SIZE(A2), %xmm9	movlpd	 0 * SIZE(Y1), %xmm4	mulsd	 %xmm0, %xmm8	mulsd	 %xmm1, %xmm9	addsd	 %xmm8,  %xmm4	addsd	 %xmm9,  %xmm4	movlpd	 %xmm4, 0 * SIZE(Y1)	addq	 $1 * SIZE, A1	addq	 $1 * SIZE, A2	addq	 $1 * SIZE, Y1	ALIGN_3.L30:	testq	$1, N	je	.L995	movq	YY, Y1	movq	A,  A1	movlpd	(X), %xmm0	mulsd	STACK_ALPHA, %xmm0	unpcklpd %xmm0, %xmm0	movq	M,  I	sarq	$4, I	jle	.L35	movupd	 0 * SIZE(A1), %xmm8	movupd	 2 * SIZE(A1), %xmm9	movupd	 4 * SIZE(A1), %xmm10	movupd	 6 * SIZE(A1), %xmm11	movupd	 8 * SIZE(A1), %xmm12	movupd	10 * SIZE(A1), %xmm13	movupd	12 * SIZE(A1), %xmm14	movupd	14 * SIZE(A1), %xmm15	mulpd	 %xmm0, %xmm8	mulpd	 %xmm0, %xmm9	mulpd	 %xmm0, %xmm10	mulpd	 %xmm0, %xmm11	movupd	 0 * SIZE(Y1), %xmm4	movupd	 2 * SIZE(Y1), %xmm5	movupd	 4 * SIZE(Y1), %xmm6	movupd	 6 * SIZE(Y1), %xmm7	decq	 I	jle	 .L32	ALIGN_3.L31:	PREFETCH	PREFETCHSIZE * SIZE(A1)	addpd	 %xmm8,  %xmm4	movupd	16 * SIZE(A1), %xmm8	mulpd	 %xmm0, %xmm12	addpd	 %xmm9,  %xmm5	movupd	18 * SIZE(A1), %xmm9	mulpd	 %xmm0, %xmm13	addpd	 %xmm10, %xmm6	movupd	20 * SIZE(A1), %xmm10	mulpd	 %xmm0, %xmm14	addpd	 %xmm11, %xmm7	movupd	22 * SIZE(A1), %xmm11	mulpd	 %xmm0, %xmm15	movlpd	 %xmm4, 0 * SIZE(Y1)	movhpd	 %xmm4, 1 * SIZE(Y1)	movupd	 8 * SIZE(Y1), %xmm4	movlpd	 %xmm5, 2 * SIZE(Y1)	movhpd	 %xmm5, 3 * SIZE(Y1)	movupd	10 * SIZE(Y1), %xmm5	movlpd	 %xmm6, 4 * SIZE(Y1)	movhpd	 %xmm6, 5 * SIZE(Y1)	movupd	12 * SIZE(Y1), %xmm6	movlpd	 %xmm7, 6 * SIZE(Y1)	movhpd	 %xmm7, 7 * SIZE(Y1)	movupd	14 * SIZE(Y1), %xmm7	PREFETCH	PREFETCHSIZE * SIZE(Y1)	addpd	 %xmm12, %xmm4	movupd	24 * SIZE(A1), %xmm12	mulpd	 %xmm0, %xmm8	addpd	 %xmm13, %xmm5	movupd	26 * SIZE(A1), %xmm13	mulpd	 %xmm0, %xmm9	addpd	 %xmm14, %xmm6	movupd	28 * SIZE(A1), %xmm14	mulpd	 %xmm0, %xmm10	addpd	 %xmm15, %xmm7	movupd	30 * SIZE(A1), %xmm15	mulpd	 %xmm0, %xmm11	movlpd	 %xmm4,  8 * SIZE(Y1)	movhpd	 %xmm4,  9 * SIZE(Y1)	movupd	16 * SIZE(Y1), %xmm4	movlpd	 %xmm5, 10 * SIZE(Y1)	movhpd	 %xmm5, 11 * SIZE(Y1)	movupd	18 * SIZE(Y1), %xmm5	movlpd	 %xmm6, 12 * SIZE(Y1)	movhpd	 %xmm6, 13 * SIZE(Y1)	movupd	20 * SIZE(Y1), %xmm6	movlpd	 %xmm7, 14 * SIZE(Y1)	movhpd	 %xmm7, 15 * SIZE(Y1)	movupd	22 * SIZE(Y1), %xmm7	subq	$-16 * SIZE, A1	subq	$-16 * SIZE, Y1	decq	 I	jg	.L31	ALIGN_3.L32:	addpd	 %xmm8,  %xmm4	mulpd	 %xmm0, %xmm12	addpd	 %xmm9,  %xmm5	mulpd	 %xmm0, %xmm13	addpd	 %xmm10, %xmm6	mulpd	 %xmm0, %xmm14	addpd	 %xmm11, %xmm7	mulpd	 %xmm0, %xmm15	movlpd	 %xmm4, 0 * SIZE(Y1)	movhpd	 %xmm4, 1 * SIZE(Y1)	movupd	 8 * SIZE(Y1), %xmm4	movlpd	 %xmm5, 2 * SIZE(Y1)	movhpd	 %xmm5, 3 * SIZE(Y1)	movupd	10 * SIZE(Y1), %xmm5	movlpd	 %xmm6, 4 * SIZE(Y1)	movhpd	 %xmm6, 5 * SIZE(Y1)	movupd	12 * SIZE(Y1), %xmm6	movlpd	 %xmm7, 6 * SIZE(Y1)	movhpd	 %xmm7, 7 * SIZE(Y1)	movupd	14 * SIZE(Y1), %xmm7	addpd	 %xmm12, %xmm4	addpd	 %xmm13, %xmm5	addpd	 %xmm14, %xmm6	addpd	 %xmm15, %xmm7	movlpd	 %xmm4,  8 * SIZE(Y1)	movhpd	 %xmm4,  9 * SIZE(Y1)	movlpd	 %xmm5, 10 * SIZE(Y1)	movhpd	 %xmm5, 11 * SIZE(Y1)	movlpd	 %xmm6, 12 * SIZE(Y1)	movhpd	 %xmm6, 13 * SIZE(Y1)	movlpd	 %xmm7, 14 * SIZE(Y1)	movhpd	 %xmm7, 15 * SIZE(Y1)	subq	$-16 * SIZE, A1	subq	$-16 * SIZE, A2	subq	$-16 * SIZE, Y1	ALIGN_3.L35:	testq	$8, M	je	.L36	movupd	 0 * SIZE(A1), %xmm8	movupd	 2 * SIZE(A1), %xmm9	movupd	 4 * SIZE(A1), %xmm10	movupd	 6 * SIZE(A1), %xmm11	movupd	 0 * SIZE(Y1), %xmm4	movupd	 2 * SIZE(Y1), %xmm5	movupd	 4 * SIZE(Y1), %xmm6	movupd	 6 * SIZE(Y1), %xmm7	mulpd	 %xmm0, %xmm8	mulpd	 %xmm0, %xmm9	mulpd	 %xmm0, %xmm10	mulpd	 %xmm0, %xmm11	addpd	 %xmm8,  %xmm4	addpd	 %xmm9,  %xmm5	addpd	 %xmm10, %xmm6	addpd	 %xmm11, %xmm7	movlpd	 %xmm4,  0 * SIZE(Y1)	movhpd	 %xmm4,  1 * SIZE(Y1)	movlpd	 %xmm5,  2 * SIZE(Y1)	movhpd	 %xmm5,  3 * SIZE(Y1)	movlpd	 %xmm6,  4 * SIZE(Y1)	movhpd	 %xmm6,  5 * SIZE(Y1)	movlpd	 %xmm7,  6 * SIZE(Y1)	movhpd	 %xmm7,  7 * SIZE(Y1)	addq	 $8 * SIZE, A1	addq	 $8 * SIZE, Y1	ALIGN_3.L36:	testq	$4, M	je	.L37	movupd	 0 * SIZE(A1), %xmm8	movupd	 2 * SIZE(A1), %xmm9	movupd	 0 * SIZE(Y1), %xmm4	movupd	 2 * SIZE(Y1), %xmm5	mulpd	 %xmm0, %xmm8	mulpd	 %xmm0, %xmm9	addpd	 %xmm8,  %xmm4	addpd	 %xmm9,  %xmm5	movlpd	 %xmm4,  0 * SIZE(Y1)	movhpd	 %xmm4,  1 * SIZE(Y1)	movlpd	 %xmm5,  2 * SIZE(Y1)	movhpd	 %xmm5,  3 * SIZE(Y1)	addq	 $4 * SIZE, A1	addq	 $4 * SIZE, Y1	ALIGN_3.L37:	testq	$2, M	je	.L38	movupd	 0 * SIZE(A1), %xmm8	movupd	 0 * SIZE(Y1), %xmm4	mulpd	 %xmm0, %xmm8	addpd	 %xmm8, %xmm4	movlpd	 %xmm4,  0 * SIZE(Y1)	movhpd	 %xmm4,  1 * SIZE(Y1)	addq	 $2 * SIZE, A1	addq	 $2 * SIZE, Y1	ALIGN_3.L38:	testq	$1, M	je	.L995	movlpd	 0 * SIZE(A1), %xmm8	movlpd	 0 * SIZE(Y1), %xmm4	mulsd	 %xmm0, %xmm8	addsd	 %xmm8,  %xmm4	movlpd	 %xmm4, 0 * SIZE(Y1)	ALIGN_3.L995:	cmpq	$SIZE, INCY	je	.L999	movq	Y,  Y1	movq	M,  %rax	sarq	$2, %rax	jle	.L997	ALIGN_3.L996:	movlpd	0 * SIZE(Y), %xmm4	addq	INCY, Y	movhpd	0 * SIZE(Y), %xmm4	addq	INCY, Y	movlpd	0 * SIZE(Y), %xmm5	addq	INCY, Y	movhpd	0 * SIZE(Y), %xmm5	addq	INCY, Y	movapd	0 * SIZE(YY), %xmm0	movapd	2 * SIZE(YY), %xmm1	addpd	%xmm4, %xmm0	addpd	%xmm5, %xmm1	movlpd	%xmm0, 0 * SIZE(Y1)	addq	INCY, Y1	movhpd	%xmm0, 0 * SIZE(Y1)	addq	INCY, Y1	movlpd	%xmm1, 0 * SIZE(Y1)	addq	INCY, Y1	movhpd	%xmm1, 0 * SIZE(Y1)	addq	INCY, Y1	addq	$4 * SIZE, YY	decq	%rax	jg	.L996	ALIGN_3.L997:	movq	M,  %rax	andq	$3, %rax	jle	.L999	ALIGN_3.L998:	movlpd	0 * SIZE(YY), %xmm0	addsd	0 * SIZE(Y), %xmm0	movlpd	%xmm0, 0 * SIZE(Y1)	addq	$SIZE, YY	addq	INCY, Y	addq	INCY, Y1	decq	%rax	jg	.L998	ALIGN_3.L999:	movq	  0(%rsp), %rbx	movq	  8(%rsp), %rbp	movq	 16(%rsp), %r12	movq	 24(%rsp), %r13	movq	 32(%rsp), %r14	movq	 40(%rsp), %r15#ifdef WINDOWS_ABI	movq	 48(%rsp), %rdi	movq	 56(%rsp), %rsi	movups	 64(%rsp), %xmm6	movups	 80(%rsp), %xmm7	movups	 96(%rsp), %xmm8	movups	112(%rsp), %xmm9	movups	128(%rsp), %xmm10	movups	144(%rsp), %xmm11	movups	160(%rsp), %xmm12	movups	176(%rsp), %xmm13	movups	192(%rsp), %xmm14	movups	208(%rsp), %xmm15#endif	addq	$STACKSIZE, %rsp	ret	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -