⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemv_t_sse.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
	# Fragment of a 32-bit x86 SSE single-precision kernel (the page title
	# names it gemv_t_sse.s).  Only the tail of the routine is visible here.
	#
	# Symbols such as SIZE, J, N, A, LDA, NLDA, IS, P, M, MIN_M, INCY, BUFFER,
	# OLD_STACK, ALIGN_3 and EPILOGUE, and the labels .L10 and .L51, are
	# defined outside this view.
	#
	# Register roles as used by the visible code:
	#   %ebx, %ecx  column pointers into A (a_offset, a_offset2)
	#   %edx        LDA inside the dot-product loops; strided read pointer
	#               during write-back
	#   %esi        read pointer into BUFFER
	#   %ebp        pointer to the output elements being updated
	#   %eax        loop counter; INCY during write-back
	#   %xmm4-7     per-column accumulators
	#   %xmm0,%xmm2 vectors preloaded from BUFFER
	#   %xmm3       scale factor applied to every sum (presumably alpha,
	#               loaded outside this view - TODO confirm)
	# LDA and INCY are used as byte counts: LDA is added directly to
	# pointers and INCY is compared against $SIZE.

	# --- Tail of a contiguous write-back of four results; the start of this
	# sequence lies before the visible text. ---
	addss	%xmm2, %xmm5
	movss	2 * SIZE(%ebp), %xmm1
	movss	3 * SIZE(%ebp), %xmm2
	addss	%xmm1, %xmm6
	addss	%xmm2, %xmm7
	movss	%xmm4, 0 * SIZE(%ebp)
	movss	%xmm5, 1 * SIZE(%ebp)
	movss	%xmm6, 2 * SIZE(%ebp)
	movss	%xmm7, 3 * SIZE(%ebp)
	addl	$4 * SIZE, %ebp			# step past the four elements just stored
	decl	J
	jg	.L51				# more groups of four remain (.L51 is outside this view)
	movl	N, %esi
	andl	$3, %esi
	jne	.L100				# N & 3 != 0: handle the leftover 1-3 columns
	jmp	.L99
	ALIGN_3

	# --- Strided write-back of four results.  %edx walks the reads and %ebp
	# the writes, both stepping by %eax (set before the visible text;
	# presumably INCY as in .L106 - TODO confirm). ---
.L56:
	movss	0 * SIZE(%edx), %xmm1
	addl	%eax, %edx
	movss	0 * SIZE(%edx), %xmm2
	addl	%eax, %edx
	addss	%xmm1, %xmm4
	addss	%xmm2, %xmm5
	movss	0 * SIZE(%edx), %xmm1
	addl	%eax, %edx
	movss	0 * SIZE(%edx), %xmm2
	addss	%xmm1, %xmm6
	addss	%xmm2, %xmm7
	movss	%xmm4, 0 * SIZE(%ebp)
	addl	%eax, %ebp
	movss	%xmm5, 0 * SIZE(%ebp)
	addl	%eax, %ebp
	movss	%xmm6, 0 * SIZE(%ebp)
	addl	%eax, %ebp
	movss	%xmm7, 0 * SIZE(%ebp)
	addl	%eax, %ebp			# %ebp now points one stride past the fourth result
	decl	J
	jg	.L51
	movl	N, %esi
	andl	$3, %esi
	jne	.L100
	ALIGN_3

	# --- Advance A by NLDA, then branch back to .L10 (outside this view)
	# while IS + P is below M.
	# NOTE(review): the sum IS + P is not stored back here; presumably .L10
	# updates IS - confirm. ---
.L99:
	movl	A, %ebx
	addl	NLDA, %ebx
	movl	%ebx, A
	movl	IS, %esi
	addl	$P, %esi
	cmpl	M,  %esi
	jl	.L10
	ALIGN_3

	# --- Exit: restore the stack pointer saved in OLD_STACK, pop the four
	# saved registers and return. ---
.L999:
	movl	OLD_STACK, %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
	ALIGN_3

	# --- Leftover-column dispatch on N & 3: 3 -> .L101, 2 -> .L111,
	# otherwise .L120 (this label is only reached with a non-zero value). ---
.L100:
	movl	N, %esi
	andl	$3, %esi
	cmpl	$3, %esi
	jne	.L110
	ALIGN_3

	# --- Three leftover columns: accumulate three dot products against
	# BUFFER in xmm4, xmm5, xmm6. ---
.L101:
	movl	A, %ebx				# a_offset = a
	movl	LDA, %edx
	leal	(%ebx, %edx), %ecx		# a_offset2 = a + lda
	leal	(%ebx, %edx, 2), %eax
	addl	%edx, %eax			# eax = a + 3 * lda
	movl	%eax, A				# A now points past the three columns
	movl	BUFFER, %esi
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	movaps	0 * SIZE(%esi), %xmm0		# movaps needs a 16-byte aligned address
	movaps	4 * SIZE(%esi), %xmm2
	movl	MIN_M, %eax
	sarl	$3,  %eax			# eax = MIN_M / 8 iterations of the unrolled loop
	jle	.L103
	ALIGN_3

	# Eight elements per iteration.  Column data is loaded with movsd/movhps
	# (no alignment requirement); the third column is addressed as
	# %ebx + 2 * %edx.  The next BUFFER vectors are loaded ahead of use.
	# NOTE(review): on the final iteration this reads BUFFER up to 16 elements
	# past the loop start position; BUFFER is presumably padded - confirm.
.L102:
	movsd	 0 * SIZE(%ebx), %xmm1
	movhps	 2 * SIZE(%ebx), %xmm1
	mulps	%xmm0, %xmm1
	addps	%xmm1, %xmm4
	movsd	 0 * SIZE(%ecx), %xmm1
	movhps	 2 * SIZE(%ecx), %xmm1
	mulps	%xmm0, %xmm1
	addps	%xmm1, %xmm5
	movsd	 0 * SIZE(%ebx, %edx, 2), %xmm1
	movhps	 2 * SIZE(%ebx, %edx, 2), %xmm1
	mulps	%xmm0, %xmm1
	addps	%xmm1, %xmm6
	movaps	 8 * SIZE(%esi), %xmm0		# preload for the next iteration
	movsd	 4 * SIZE(%ebx), %xmm1
	movhps	 6 * SIZE(%ebx), %xmm1
	mulps	%xmm2, %xmm1
	addps	%xmm1, %xmm4
	movsd	 4 * SIZE(%ecx), %xmm1
	movhps	 6 * SIZE(%ecx), %xmm1
	mulps	%xmm2, %xmm1
	addps	%xmm1, %xmm5
	movsd	 4 * SIZE(%ebx, %edx, 2), %xmm1
	movhps	 6 * SIZE(%ebx, %edx, 2), %xmm1
	mulps	%xmm2, %xmm1
	addps	%xmm1, %xmm6
	movaps	12 * SIZE(%esi), %xmm2		# preload for the next iteration
	addl	$8 * SIZE, %ecx
	addl	$8 * SIZE, %ebx
	addl	$8 * SIZE, %esi
	decl	%eax
	jg	.L102
	ALIGN_3

.L103:
	movl	MIN_M, %eax
	andl	$7,  %eax			# eax = MIN_M mod 8 scalar iterations
	je	.L105
	ALIGN_3

	# Scalar remainder.  The low lane of xmm0 already holds the BUFFER element
	# at %esi, so each pass multiplies first and then fetches the next element.
.L104:
	movss	0 * SIZE(%ebx), %xmm1
	mulss	%xmm0, %xmm1
	addss	%xmm1, %xmm4
	movss	0 * SIZE(%ecx), %xmm1
	addl	$SIZE, %ecx
	mulss	%xmm0, %xmm1
	addss	%xmm1, %xmm5
	movss	0 * SIZE(%ebx, %edx, 2), %xmm1	# third column, read before %ebx is advanced
	addl	$SIZE, %ebx
	mulss	%xmm0, %xmm1
	addss	%xmm1, %xmm6
	movss	 1 * SIZE(%esi), %xmm0		# fetch the next BUFFER element
	addl	$SIZE, %esi
	decl	%eax
	jg	.L104
	ALIGN_3

	# Horizontal reduction of each accumulator into its low lane, scaling by
	# xmm3, then read-modify-write of three output elements.
.L105:
	movaps	%xmm4, %xmm0
	shufps	$0xe, %xmm4, %xmm4		# lanes 0,1 <- lanes 2,3
	addps	 %xmm0, %xmm4			# lanes 0,1 now hold the two pairwise sums

	movaps	%xmm5, %xmm0
	shufps	$0xe, %xmm5, %xmm5
	addps	 %xmm0, %xmm5
	movaps	%xmm6, %xmm0
	shufps	$0xe, %xmm6, %xmm6
	addps	 %xmm0, %xmm6
	movaps	%xmm4, %xmm0
	shufps	$0x39, %xmm4, %xmm4		# lane 0 <- lane 1
	addss	 %xmm0, %xmm4			# lane 0 = sum of all four lanes
	movaps	%xmm5, %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	 %xmm0, %xmm5
	movaps	%xmm6, %xmm0
	shufps	$0x39, %xmm6, %xmm6
	addss	 %xmm0, %xmm6
	mulss	%xmm3, %xmm4			# scale each sum by the factor in xmm3
	mulss	%xmm3, %xmm5
	mulss	%xmm3, %xmm6
	movl	INCY, %eax
	movl	%ebp, %edx
	cmpl	$SIZE, %eax
	jne	.L106				# INCY != SIZE: strided write-back
	movss	0 * SIZE(%ebp), %xmm1
	movss	1 * SIZE(%ebp), %xmm2
	addss	%xmm1, %xmm4
	addss	%xmm2, %xmm5
	movss	2 * SIZE(%ebp), %xmm1
	addss	%xmm1, %xmm6
	movss	%xmm4, 0 * SIZE(%ebp)
	movss	%xmm5, 1 * SIZE(%ebp)
	movss	%xmm6, 2 * SIZE(%ebp)
	jmp	.L99				# NOTE(review): %ebp is not advanced here; presumably reloaded at .L10 - confirm
	ALIGN_3

	# Strided write-back of three results: reads through %edx, writes through
	# %ebp, both stepping by INCY.
.L106:
	movss	0 * SIZE(%edx), %xmm1
	addl	%eax, %edx
	movss	0 * SIZE(%edx), %xmm2
	addl	%eax, %edx
	addss	%xmm1, %xmm4
	addss	%xmm2, %xmm5
	movss	0 * SIZE(%edx), %xmm1
	addss	%xmm1, %xmm6
	movss	%xmm4, 0 * SIZE(%ebp)
	addl	%eax, %ebp
	movss	%xmm5, 0 * SIZE(%ebp)
	addl	%eax, %ebp
	movss	%xmm6, 0 * SIZE(%ebp)
	jmp	.L99
	ALIGN_3

.L110:
	cmpl	$2, %esi			# %esi still holds N & 3 from .L100
	jne	.L120
	ALIGN_3

	# --- Two leftover columns: same scheme as .L101 with accumulators xmm4
	# and xmm5. ---
.L111:
	movl	A, %ebx				# a_offset = a
	movl	LDA, %edx
	leal	(%ebx, %edx), %ecx		# a_offset2 = a + lda
	leal	(%ebx, %edx, 2), %eax
	movl	%eax, A				# A = a + 2 * lda
	movl	BUFFER, %esi
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	movaps	0 * SIZE(%esi), %xmm0
	movaps	4 * SIZE(%esi), %xmm2
	movl	MIN_M, %eax
	sarl	$3,  %eax
	jle	.L113
	ALIGN_3

	# Eight elements per iteration for both columns.
.L112:
	movsd	 0 * SIZE(%ebx), %xmm1
	movhps	 2 * SIZE(%ebx), %xmm1
	mulps	%xmm0, %xmm1
	addps	%xmm1, %xmm4
	movsd	 0 * SIZE(%ecx), %xmm1
	movhps	 2 * SIZE(%ecx), %xmm1
	mulps	%xmm0, %xmm1
	addps	%xmm1, %xmm5
	movaps	 8 * SIZE(%esi), %xmm0		# preload for the next iteration
	movsd	 4 * SIZE(%ebx), %xmm1
	movhps	 6 * SIZE(%ebx), %xmm1
	mulps	%xmm2, %xmm1
	addps	%xmm1, %xmm4
	movsd	 4 * SIZE(%ecx), %xmm1
	movhps	 6 * SIZE(%ecx), %xmm1
	mulps	%xmm2, %xmm1
	addps	%xmm1, %xmm5
	movaps	12 * SIZE(%esi), %xmm2		# preload for the next iteration
	addl	$8 * SIZE, %ebx
	addl	$8 * SIZE, %ecx
	addl	$8 * SIZE, %esi
	decl	%eax
	jg	.L112
	ALIGN_3

.L113:
	movl	MIN_M, %eax
	andl	$7,  %eax
	je	.L115
	ALIGN_3

	# Scalar remainder for two columns.
.L114:
	movss	0 * SIZE(%ebx), %xmm1
	addl	$SIZE, %ebx
	mulss	%xmm0, %xmm1
	addss	%xmm1, %xmm4
	movss	0 * SIZE(%ecx), %xmm1
	addl	$SIZE, %ecx
	mulss	%xmm0, %xmm1
	addss	%xmm1, %xmm5
	movss	 1 * SIZE(%esi), %xmm0		# fetch the next BUFFER element
	addl	$SIZE, %esi
	decl	%eax
	jg	.L114
	ALIGN_3

	# Reduce xmm4 and xmm5 to their low lanes, scale by xmm3, update two
	# output elements.
.L115:
	movaps	%xmm4, %xmm0
	shufps	$0xe, %xmm4, %xmm4		# lanes 0,1 <- lanes 2,3
	addps	 %xmm0, %xmm4
	movaps	%xmm5, %xmm0
	shufps	$0xe, %xmm5, %xmm5
	addps	 %xmm0, %xmm5
	movaps	%xmm4, %xmm0
	shufps	$0x39, %xmm4, %xmm4		# lane 0 <- lane 1
	addss	 %xmm0, %xmm4
	movaps	%xmm5, %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	 %xmm0, %xmm5
	mulss	%xmm3, %xmm4
	mulss	%xmm3, %xmm5
	movl	INCY, %eax
	movl	%ebp, %edx
	cmpl	$SIZE, %eax
	jne	.L116				# INCY != SIZE: strided write-back
	movss	0 * SIZE(%ebp), %xmm1
	movss	1 * SIZE(%ebp), %xmm2
	addss	%xmm1, %xmm4
	addss	%xmm2, %xmm5
	movss	%xmm4, 0 * SIZE(%ebp)
	movss	%xmm5, 1 * SIZE(%ebp)
	jmp	.L99
	ALIGN_3

	# Strided write-back of two results.
.L116:
	movss	0 * SIZE(%edx), %xmm1
	addl	%eax, %edx
	movss	0 * SIZE(%edx), %xmm2
	addss	%xmm1, %xmm4
	addss	%xmm2, %xmm5
	movss	%xmm4, 0 * SIZE(%ebp)
	addl	%eax, %ebp
	movss	%xmm5, 0 * SIZE(%ebp)
	jmp	.L99
	ALIGN_3

	# --- One leftover column: single accumulator xmm4. ---
.L120:
	movl	A, %ebx				# a_offset = a
	movl	LDA, %edx
	leal	(%ebx, %edx), %ecx		# a_offset2 = a + lda; not read again in this one-column path
	leal	(%ebx, %edx, 1), %eax
	movl	%eax, A				# A = a + lda
	movl	BUFFER, %esi
	pxor	%xmm4, %xmm4
	movaps	0 * SIZE(%esi), %xmm0
	movaps	4 * SIZE(%esi), %xmm2
	movl	MIN_M, %eax
	sarl	$3,  %eax
	jle	.L123
	ALIGN_3

	# Eight elements per iteration for the single column.
.L122:
	movsd	 0 * SIZE(%ebx), %xmm1
	movhps	 2 * SIZE(%ebx), %xmm1
	mulps	%xmm0, %xmm1
	addps	%xmm1, %xmm4
	movaps	 8 * SIZE(%esi), %xmm0		# preload for the next iteration
	movsd	 4 * SIZE(%ebx), %xmm1
	movhps	 6 * SIZE(%ebx), %xmm1
	mulps	%xmm2, %xmm1
	addps	%xmm1, %xmm4
	movaps	12 * SIZE(%esi), %xmm2		# preload for the next iteration
	addl	$8 * SIZE, %ebx
	addl	$8 * SIZE, %esi
	decl	%eax
	jg	.L122
	ALIGN_3

.L123:
	movl	MIN_M, %eax
	andl	$7,  %eax
	je	.L125
	ALIGN_3

	# Scalar remainder for the single column.
.L124:
	movss	0 * SIZE(%ebx), %xmm1
	addl	$SIZE, %ebx
	mulss	%xmm0, %xmm1
	addss	%xmm1, %xmm4
	movss	 1 * SIZE(%esi), %xmm0		# fetch the next BUFFER element
	addl	$SIZE, %esi
	decl	%eax
	jg	.L124
	ALIGN_3

	# Reduce xmm4 to its low lane, scale by xmm3 and update one output
	# element.  A single element needs no stride, so INCY is not consulted.
.L125:
	movaps	%xmm4, %xmm0
	shufps	$0xe, %xmm4, %xmm4		# lanes 0,1 <- lanes 2,3
	addps	 %xmm0, %xmm4

	movaps	%xmm4, %xmm0
	shufps	$0x39, %xmm4, %xmm4		# lane 0 <- lane 1
	addss	 %xmm0, %xmm4
	mulss	%xmm3, %xmm4
	movss	0 * SIZE(%ebp), %xmm1
	addss	%xmm1, %xmm4
	movss	%xmm4, 0 * SIZE(%ebp)
	jmp	.L99

	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -