⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemv_t_sse2.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
	/*
	 * Tail of a transposed matrix-vector kernel (file name: gemv_t_sse2).
	 * The function entry, the stack-slot macros (A, LDA, NLDA, IS, M, N, J,
	 * MIN_M, BUFFER, INCY, OLD_STACK) and labels .L10 / .L51 are defined
	 * before this excerpt.
	 *
	 * Register roles as used by the code below:
	 *   %ebx, %ecx     pointers into two consecutive columns (a, a + lda)
	 *   %edx           LDA while accumulating; scratch pointer when storing
	 *   %esi           read pointer into BUFFER (presumably the packed x
	 *                  vector; movapd requires it to be 16-byte aligned)
	 *   %ebp           pointer to the values being updated (presumably y)
	 *   %xmm0, %xmm2   alternating pairs of BUFFER values
	 *   %xmm4..%xmm7   per-column accumulators
	 *   %xmm3          scale factor applied to every sum (presumably alpha;
	 *                  it is set outside this excerpt)
	 * NOTE(review): packed-double instructions imply SIZE == 8; confirm.
	 */

	/* End of a four-column block whose start is outside this excerpt:
	   finish adding the old values and store xmm4..xmm7 to four
	   locations %eax bytes apart, advancing %ebp past each one. */
	addl	%eax, %edx
	addsd	%xmm1, %xmm6
	addsd	%xmm2, %xmm7
	movsd	%xmm4, 0 * SIZE(%ebp)
	addl	%eax, %ebp
	movsd	%xmm5, 0 * SIZE(%ebp)
	addl	%eax, %ebp
	movsd	%xmm6, 0 * SIZE(%ebp)
	addl	%eax, %ebp
	movsd	%xmm7, 0 * SIZE(%ebp)
	addl	%eax, %ebp
	decl	J				# one fewer block left
	jg	.L51
	movl	N, %esi
	andl	$3, %esi			# esi = N mod 4 = leftover columns
	jne	.L100				# 1..3 columns remain
	ALIGN_3

	/* Advance A by NLDA, then loop back to .L10 while IS + P < M
	   (signed compare). IS itself is not written back here. */
.L99:
	movl	A, %ebx
	addl	NLDA, %ebx
	movl	%ebx, A
	movl	IS, %esi
	addl	$P, %esi
	cmpl	M,  %esi
	jl	.L10
	ALIGN_3

	/* Function exit: restore the stack pointer saved in OLD_STACK,
	   pop the four saved registers and return. */
.L999:
	movl	OLD_STACK, %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
	ALIGN_3

	/* Leftover-column dispatch: 3 -> .L101, 2 -> .L111, otherwise .L120.
	   NOTE(review): %esi already holds N & 3 on the only visible entry
	   (the jne above); it is recomputed here. */
.L100:
	movl	N, %esi
	andl	$3, %esi
	cmpl	$3, %esi
	jne	.L110
	ALIGN_3

	/* ---- three leftover columns ---- */
.L101:
	movl	A, %ebx				# a_offset = a
	movl	LDA, %edx
	leal	(%ebx, %edx), %ecx		# a_offset2 = a + lda
	leal	(%ebx, %edx, 2), %eax
	addl	%edx, %eax			# eax = a + 3 * lda
	movl	%eax, A				# A now points past the three columns
	movl	BUFFER, %esi
	pxor	%xmm4, %xmm4			# clear the three accumulators
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	movapd	0 * SIZE(%esi), %xmm0		# preload buffer[0..1]
	movapd	2 * SIZE(%esi), %xmm2		# preload buffer[2..3]
	movl	MIN_M, %eax
	sarl	$3,  %eax			# eax = MIN_M / 8 unrolled iterations
	jle	.L103
	ALIGN_3

	/* Main loop: 8 rows x 3 columns per iteration. Each movsd/movhpd pair
	   loads two consecutive doubles of a column; the third column is
	   addressed as (%ebx, %edx, 2). The buffer values for the next
	   iteration are loaded (offsets 8 and 10) before %esi advances.
	   NOTE(review): this reads up to 4 elements beyond the ones consumed,
	   so BUFFER is presumably padded; confirm. */
.L102:
	movsd	0 * SIZE(%ebx), %xmm1
	movhpd	1 * SIZE(%ebx), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	movsd	0 * SIZE(%ecx), %xmm1
	movhpd	1 * SIZE(%ecx), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm5
	movsd	0 * SIZE(%ebx, %edx, 2), %xmm1
	movhpd	1 * SIZE(%ebx, %edx, 2), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm6
	movapd	4 * SIZE(%esi), %xmm0
	movsd	2 * SIZE(%ebx), %xmm1
	movhpd	3 * SIZE(%ebx), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm4
	movsd	2 * SIZE(%ecx), %xmm1
	movhpd	3 * SIZE(%ecx), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm5
	movsd	2 * SIZE(%ebx, %edx, 2), %xmm1
	movhpd	3 * SIZE(%ebx, %edx, 2), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm6
	movapd	6 * SIZE(%esi), %xmm2
	movsd	4 * SIZE(%ebx), %xmm1
	movhpd	5 * SIZE(%ebx), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	movsd	4 * SIZE(%ecx), %xmm1
	movhpd	5 * SIZE(%ecx), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm5
	movsd	4 * SIZE(%ebx, %edx, 2), %xmm1
	movhpd	5 * SIZE(%ebx, %edx, 2), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm6
	movapd	8 * SIZE(%esi), %xmm0		# first pair for the next iteration
	movsd	6 * SIZE(%ebx), %xmm1
	movhpd	7 * SIZE(%ebx), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm4
	movsd	6 * SIZE(%ecx), %xmm1
	movhpd	7 * SIZE(%ecx), %xmm1
	addl	$8 * SIZE, %ecx
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm5
	movsd	6 * SIZE(%ebx, %edx, 2), %xmm1
	movhpd	7 * SIZE(%ebx, %edx, 2), %xmm1
	addl	$8 * SIZE, %ebx			# after the last use of %ebx as a base
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm6
	movapd	10 * SIZE(%esi), %xmm2		# second pair for the next iteration
	addl	$8 * SIZE, %esi
	decl	%eax
	jg	.L102
	ALIGN_3

.L103:
	movl	MIN_M, %eax
	andl	$7,  %eax			# eax = MIN_M mod 8 leftover rows
	je	.L105
	ALIGN_3

	/* Scalar remainder: one row per iteration. The low half of %xmm0
	   already holds the current buffer value; the next one is loaded at
	   the bottom of the loop. Only the low lanes of the accumulators
	   change here. */
.L104:
	movsd	0 * SIZE(%ebx), %xmm1
	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm4
	movsd	0 * SIZE(%ecx), %xmm1
	addl	$SIZE, %ecx
	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm5
	movsd	0 * SIZE(%ebx, %edx, 2), %xmm1
	addl	$SIZE, %ebx
	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm6
	movsd	 1 * SIZE(%esi), %xmm0
	addl	$SIZE, %esi
	decl	%eax
	jg	.L104
	ALIGN_3

	/* Reduce each accumulator to a scalar (low lane = low + high),
	   scale by %xmm3, then add into memory at %ebp. */
.L105:
	movapd	%xmm4, %xmm0
	unpckhpd %xmm4, %xmm4
	addsd	 %xmm0, %xmm4

	movapd	%xmm5, %xmm0
	unpckhpd %xmm5, %xmm5
	addsd	 %xmm0, %xmm5
	movapd	%xmm6, %xmm0
	unpckhpd %xmm6, %xmm6
	addsd	 %xmm0, %xmm6
	mulsd	%xmm3, %xmm4
	mulsd	%xmm3, %xmm5
	mulsd	%xmm3, %xmm6
	movl	INCY, %eax
	movl	%ebp, %edx			# edx = read cursor for the strided path
	cmpl	$SIZE, %eax
	jne	.L106				# INCY != SIZE: non-contiguous path
	/* Contiguous path: three adjacent elements at %ebp. */
	movsd	0 * SIZE(%ebp), %xmm1
	movsd	1 * SIZE(%ebp), %xmm2
	addsd	%xmm1, %xmm4
	addsd	%xmm2, %xmm5
	movsd	2 * SIZE(%ebp), %xmm1
	addsd	%xmm1, %xmm6
	movsd	%xmm4, 0 * SIZE(%ebp)
	movsd	%xmm5, 1 * SIZE(%ebp)
	movsd	%xmm6, 2 * SIZE(%ebp)
	jmp	.L99
	ALIGN_3

	/* Strided path: read through %edx and write through %ebp, stepping
	   each by %eax (the INCY value, used as a byte offset) between
	   elements. */
.L106:
	movsd	0 * SIZE(%edx), %xmm1
	addl	%eax, %edx
	movsd	0 * SIZE(%edx), %xmm2
	addl	%eax, %edx
	addsd	%xmm1, %xmm4
	addsd	%xmm2, %xmm5
	movsd	0 * SIZE(%edx), %xmm1
	addsd	%xmm1, %xmm6
	movsd	%xmm4, 0 * SIZE(%ebp)
	addl	%eax, %ebp
	movsd	%xmm5, 0 * SIZE(%ebp)
	addl	%eax, %ebp
	movsd	%xmm6, 0 * SIZE(%ebp)
	jmp	.L99
	ALIGN_3

.L110:
	cmpl	$2, %esi			# esi still holds N & 3 from .L100
	jne	.L120
	ALIGN_3

	/* ---- two leftover columns ---- */
.L111:
	movl	A, %ebx				# a_offset = a
	movl	LDA, %edx
	leal	(%ebx, %edx), %ecx		# a_offset2 = a + lda
	leal	(%ebx, %edx, 2), %eax		# eax = a + 2 * lda
	movl	%eax, A				# A now points past the two columns
	movl	BUFFER, %esi
	pxor	%xmm4, %xmm4			# clear the two accumulators
	pxor	%xmm5, %xmm5
	movapd	0 * SIZE(%esi), %xmm0		# preload buffer[0..1]
	movapd	2 * SIZE(%esi), %xmm2		# preload buffer[2..3]
	movl	MIN_M, %eax
	sarl	$3,  %eax			# eax = MIN_M / 8 unrolled iterations
	jle	.L113
	ALIGN_3

	/* Main loop: 8 rows x 2 columns per iteration, same pipelining of
	   the buffer loads as in .L102. */
.L112:
	movsd	0 * SIZE(%ebx), %xmm1
	movhpd	1 * SIZE(%ebx), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	movsd	0 * SIZE(%ecx), %xmm1
	movhpd	1 * SIZE(%ecx), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm5
	movapd	4 * SIZE(%esi), %xmm0
	movsd	2 * SIZE(%ebx), %xmm1
	movhpd	3 * SIZE(%ebx), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm4
	movsd	2 * SIZE(%ecx), %xmm1
	movhpd	3 * SIZE(%ecx), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm5
	movapd	6 * SIZE(%esi), %xmm2
	movsd	4 * SIZE(%ebx), %xmm1
	movhpd	5 * SIZE(%ebx), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	movsd	4 * SIZE(%ecx), %xmm1
	movhpd	5 * SIZE(%ecx), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm5
	movapd	8 * SIZE(%esi), %xmm0		# first pair for the next iteration
	movsd	6 * SIZE(%ebx), %xmm1
	movhpd	7 * SIZE(%ebx), %xmm1
	addl	$8 * SIZE, %ebx
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm4
	movsd	6 * SIZE(%ecx), %xmm1
	movhpd	7 * SIZE(%ecx), %xmm1
	addl	$8 * SIZE, %ecx
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm5
	movapd	10 * SIZE(%esi), %xmm2		# second pair for the next iteration
	addl	$8 * SIZE, %esi
	decl	%eax
	jg	.L112
	ALIGN_3

.L113:
	movl	MIN_M, %eax
	andl	$7,  %eax			# eax = MIN_M mod 8 leftover rows
	je	.L115
	ALIGN_3

	/* Scalar remainder: one row of both columns per iteration. */
.L114:
	movsd	0 * SIZE(%ebx), %xmm1
	addl	$SIZE, %ebx
	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm4
	movsd	0 * SIZE(%ecx), %xmm1
	addl	$SIZE, %ecx
	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm5
	movsd	 1 * SIZE(%esi), %xmm0
	addl	$SIZE, %esi
	decl	%eax
	jg	.L114
	ALIGN_3

	/* Reduce both accumulators to scalars, scale by %xmm3, then add
	   into memory at %ebp. */
.L115:
	movapd	%xmm4, %xmm0
	unpckhpd %xmm4, %xmm4
	addsd	 %xmm0, %xmm4

	movapd	%xmm5, %xmm0
	unpckhpd %xmm5, %xmm5
	addsd	 %xmm0, %xmm5
	mulsd	%xmm3, %xmm4
	mulsd	%xmm3, %xmm5
	movl	INCY, %eax
	movl	%ebp, %edx			# edx = read cursor for the strided path
	cmpl	$SIZE, %eax
	jne	.L116				# INCY != SIZE: non-contiguous path
	/* Contiguous path: two adjacent elements at %ebp. */
	movsd	0 * SIZE(%ebp), %xmm1
	movsd	1 * SIZE(%ebp), %xmm2
	addsd	%xmm1, %xmm4
	addsd	%xmm2, %xmm5
	movsd	%xmm4, 0 * SIZE(%ebp)
	movsd	%xmm5, 1 * SIZE(%ebp)
	jmp	.L99
	ALIGN_3

	/* Strided path: the two elements are %eax bytes apart. */
.L116:
	movsd	0 * SIZE(%edx), %xmm1
	addl	%eax, %edx
	movsd	0 * SIZE(%edx), %xmm2
	addsd	%xmm1, %xmm4
	addsd	%xmm2, %xmm5
	movsd	%xmm4, 0 * SIZE(%ebp)
	addl	%eax, %ebp
	movsd	%xmm5, 0 * SIZE(%ebp)
	jmp	.L99
	ALIGN_3

	/* ---- one leftover column (reached when N & 3 is neither 3 nor 2) ----
	   NOTE(review): %ecx is computed below but never read on the
	   .L120-.L125 path shown here. */
.L120:
	movl	A, %ebx				# a_offset = a
	movl	LDA, %edx
	leal	(%ebx, %edx), %ecx		# a_offset2 = a + lda
	leal	(%ebx, %edx, 1), %eax		# eax = a + lda
	movl	%eax, A				# A now points past the column
	movl	BUFFER, %esi
	pxor	%xmm4, %xmm4			# clear the accumulator
	movapd	0 * SIZE(%esi), %xmm0		# preload buffer[0..1]
	movapd	2 * SIZE(%esi), %xmm2		# preload buffer[2..3]
	movl	MIN_M, %eax
	sarl	$3,  %eax			# eax = MIN_M / 8 unrolled iterations
	jle	.L123
	ALIGN_3

	/* Main loop: 8 rows of the single column per iteration, same
	   pipelining of the buffer loads as in .L102. */
.L122:
	movsd	0 * SIZE(%ebx), %xmm1
	movhpd	1 * SIZE(%ebx), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	movapd	4 * SIZE(%esi), %xmm0
	movsd	2 * SIZE(%ebx), %xmm1
	movhpd	3 * SIZE(%ebx), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm4
	movapd	6 * SIZE(%esi), %xmm2
	movsd	4 * SIZE(%ebx), %xmm1
	movhpd	5 * SIZE(%ebx), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	movapd	8 * SIZE(%esi), %xmm0		# first pair for the next iteration
	movsd	6 * SIZE(%ebx), %xmm1
	movhpd	7 * SIZE(%ebx), %xmm1
	addl	$8 * SIZE, %ebx
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm4
	movapd	10 * SIZE(%esi), %xmm2		# second pair for the next iteration
	addl	$8 * SIZE, %esi
	decl	%eax
	jg	.L122
	ALIGN_3

.L123:
	movl	MIN_M, %eax
	andl	$7,  %eax			# eax = MIN_M mod 8 leftover rows
	je	.L125
	ALIGN_3

	/* Scalar remainder: one row per iteration. */
.L124:
	movsd	0 * SIZE(%ebx), %xmm1
	addl	$SIZE, %ebx
	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm4
	movsd	 1 * SIZE(%esi), %xmm0
	addl	$SIZE, %esi
	decl	%eax
	jg	.L124
	ALIGN_3

	/* Reduce to a scalar, scale by %xmm3 and add into the single element
	   at %ebp; INCY is not needed because only one element is written. */
.L125:
	movapd	%xmm4, %xmm0
	unpckhpd %xmm4, %xmm4
	addsd	 %xmm0, %xmm4

	mulsd	%xmm3, %xmm4
	movsd	0 * SIZE(%ebp), %xmm1
	addsd	%xmm1, %xmm4
	movsd	%xmm4, 0 * SIZE(%ebp)
	jmp	.L99
	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -