⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemv_n_sse.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
	mulps	%xmm3, %xmm4	movl	INCY, %eax	mulps	%xmm3, %xmm5	movl	%ecx, %edx	mulps	%xmm3, %xmm6	mulps	%xmm3, %xmm7	cmpl	$4, %eax	jne	.L29#ifdef HAVE_SSE2	movsd	  0 * SIZE(%ecx), %xmm0	movhps	  2 * SIZE(%ecx), %xmm0	addps	%xmm0, %xmm4	movsd	  4 * SIZE(%ecx), %xmm1	movhps	  6 * SIZE(%ecx), %xmm1	addps	%xmm1, %xmm5	movsd	  8 * SIZE(%ecx), %xmm2	movhps	 10 * SIZE(%ecx), %xmm2	addps	%xmm2, %xmm6	movsd	 12 * SIZE(%ecx), %xmm0	movhps	 14 * SIZE(%ecx), %xmm0	addps	%xmm0, %xmm7	movsd	%xmm4,  0 * SIZE(%ecx)	unpckhpd %xmm4, %xmm4	movsd	%xmm4,  2 * SIZE(%ecx)	movsd	%xmm5,  4 * SIZE(%ecx)	unpckhpd %xmm5, %xmm5	movsd	%xmm5,  6 * SIZE(%ecx)	movsd	%xmm6,  8 * SIZE(%ecx)	unpckhpd %xmm6, %xmm6	movsd	%xmm6, 10 * SIZE(%ecx)	movsd	%xmm7, 12 * SIZE(%ecx)	unpckhpd %xmm7, %xmm7	movsd	%xmm7, 14 * SIZE(%ecx)#else		movlps	  0 * SIZE(%ecx), %xmm0	movhps	  2 * SIZE(%ecx), %xmm0	addps	%xmm0, %xmm4	movlps	  4 * SIZE(%ecx), %xmm1	movhps	  6 * SIZE(%ecx), %xmm1	addps	%xmm1, %xmm5	movlps	  8 * SIZE(%ecx), %xmm2	movhps	 10 * SIZE(%ecx), %xmm2	addps	%xmm2, %xmm6	movlps	 12 * SIZE(%ecx), %xmm0	movhps	 14 * SIZE(%ecx), %xmm0	addps	%xmm0, %xmm7	movlps	%xmm4,  0 * SIZE(%ecx)	movhps	%xmm4,  2 * SIZE(%ecx)	movlps	%xmm5,  4 * SIZE(%ecx)	movhps	%xmm5,  6 * SIZE(%ecx)	movlps	%xmm6,  8 * SIZE(%ecx)	movhps	%xmm6, 10 * SIZE(%ecx)	movlps	%xmm7, 12 * SIZE(%ecx)	movhps	%xmm7, 14 * SIZE(%ecx)#endif	addl	$16 * SIZE, %ecx	decl	%esi	jg	.L21	movl	M,   %esi	andl	$15, %esi	jne	.L100	jmp	.L99.L29:	movss	  0 * SIZE(%ecx), %xmm0	addss	%xmm0, %xmm4	movss	%xmm4,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm4, %xmm4	addss	%xmm0, %xmm4	movss	%xmm4,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm4, %xmm4	addss	%xmm0, %xmm4	movss	%xmm4,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm4, %xmm4	addss	%xmm0, %xmm4	movss	%xmm4,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	addss	%xmm0, %xmm5	movss	%xmm5,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm5, %xmm5	addss	%xmm0, %xmm5	movss	%xmm5,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm5, %xmm5	addss	%xmm0, %xmm5	movss	%xmm5,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm5, %xmm5	addss	%xmm0, %xmm5	movss	%xmm5,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	addss	%xmm0, %xmm6	movss	%xmm6,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm6, %xmm6	addss	%xmm0, %xmm6	movss	%xmm6,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm6, %xmm6	addss	%xmm0, %xmm6	movss	%xmm6,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm6, %xmm6	addss	%xmm0, %xmm6	movss	%xmm6,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	addss	%xmm0, %xmm7	movss	%xmm7,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm7, %xmm7	addss	%xmm0, %xmm7	movss	%xmm7,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm7, %xmm7	addss	%xmm0, %xmm7	movss	%xmm7,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm7, %xmm7	addss	%xmm0, %xmm7	movss	%xmm7,  0 * SIZE(%ecx)	addl	%eax, %ecx	decl	%esi	jg	.L21	movl	M,  %esi	andl	$15, %esi	jne	.L100.L99:	movl	PLDA_M, %ebx	addl	%ebx, A	movl	N, %edi	movl	IS, %ecx	cmpl	%edi, %ecx	jl	.L01.L999:	movl	OLD_STACK, %esp	popl	%ebx	popl	%esi	popl	%edi		popl	%ebp	ret.L100:	movl	M,  %esi	testl	$8, %esi	jle	.L110.L101:	pxor	%xmm4, %xmm4	pxor	%xmm5, %xmm5	pxor	%xmm6, %xmm6	pxor	%xmm7, %xmm7	movl	A, %edx		# aoffset = a	addl	$8 * SIZE, A	# a += 8	movl	BUFFER, %ebx	# boffset = buffer	movaps	  0 * SIZE(%ebx), %xmm0	movl	%edi, %eax	# i = min_n	sarl	$2, %eax	jle	.L106.L104:	KERNELMACRO8UNROLL( 0)	KERNELMACRO8UNROLL( 8)	addl	$16 * SIZE, %ebx	decl	%eax	jg	.L104.L106:	movl	%edi, %eax	# i = min_n	andl	$3, %eax	jle	.L108.L107:	KERNELMACRO8( 0)	addl	$4 * SIZE, %ebx	decl	%eax	jg	.L107.L108:	addps	%xmm6, %xmm4	addps	%xmm7, %xmm5	mulps	%xmm3, %xmm4	movl	INCY, %eax	mulps	%xmm3, %xmm5	movl	%ecx, %edx	cmpl	$4, %eax	jne	.L109#ifdef HAVE_SSE2	movsd	  0 * SIZE(%ecx), %xmm0	movhps	  2 * SIZE(%ecx), %xmm0	addps	%xmm0, %xmm4	movsd	  4 * SIZE(%ecx), %xmm1	movhps	  6 * SIZE(%ecx), %xmm1	addps	%xmm1, %xmm5	movsd	%xmm4,  0 * SIZE(%ecx)	unpckhpd %xmm4, %xmm4	movsd	%xmm4,  2 * SIZE(%ecx)	movsd	%xmm5,  4 * SIZE(%ecx)	unpckhpd %xmm5, %xmm5	movsd	%xmm5,  6 * SIZE(%ecx)#else	movlps	  0 * SIZE(%ecx), %xmm0	movhps	  2 * SIZE(%ecx), %xmm0	addps	%xmm0, %xmm4	movlps	  4 * SIZE(%ecx), %xmm1	movhps	  6 * SIZE(%ecx), %xmm1	addps	%xmm1, %xmm5	movlps	%xmm4,  0 * SIZE(%ecx)	movhps	%xmm4,  2 * SIZE(%ecx)	movlps	%xmm5,  4 * SIZE(%ecx)	movhps	%xmm5,  6 * SIZE(%ecx)#endif	addl	$8 * SIZE, %ecx	jmp	.L110.L109:	movss	  0 * SIZE(%ecx), %xmm0	addss	%xmm0, %xmm4	movss	%xmm4,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm4, %xmm4	addss	%xmm0, %xmm4	movss	%xmm4,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm4, %xmm4	addss	%xmm0, %xmm4	movss	%xmm4,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm4, %xmm4	addss	%xmm0, %xmm4	movss	%xmm4,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	addss	%xmm0, %xmm5	movss	%xmm5,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm5, %xmm5	addss	%xmm0, %xmm5	movss	%xmm5,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm5, %xmm5	addss	%xmm0, %xmm5	movss	%xmm5,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm5, %xmm5	addss	%xmm0, %xmm5	movss	%xmm5,  0 * SIZE(%ecx)	addl	%eax, %ecx.L110:	testl	$4, %esi	jle	.L120.L111:	pxor	%xmm4, %xmm4	pxor	%xmm5, %xmm5	pxor	%xmm6, %xmm6	pxor	%xmm7, %xmm7	movl	A, %edx		# aoffset = a	addl	$4 * SIZE, A	# a += 8	movl	BUFFER, %ebx	# boffset = buffer	movaps	  0 * SIZE(%ebx), %xmm0	movl	%edi, %eax	# i = min_n	sarl	$2, %eax	jle	.L116.L114:	KERNELMACRO4UNROLL( 0)	addl	$16 * SIZE, %ebx	decl	%eax	jg	.L114.L116:	movl	%edi, %eax	# i = min_n	andl	$3, %eax	jle	.L118.L117:	KERNELMACRO4( 0)	addl	$4 * SIZE, %ebx	decl	%eax	jg	.L117.L118:	addps	%xmm5, %xmm4	addps	%xmm7, %xmm6	addps	%xmm6, %xmm4	mulps	%xmm3, %xmm4	movl	INCY, %eax	movl	%ecx, %edx	cmpl	$4, %eax	jne	.L119#ifdef HAVE_SSE2	movsd	  0 * SIZE(%ecx), %xmm0	movhps	  2 * SIZE(%ecx), %xmm0	addps	%xmm0, %xmm4	movsd	%xmm4,  0 * SIZE(%ecx)	unpckhpd %xmm4, %xmm4	movsd	%xmm4,  2 * SIZE(%ecx)#else	movss	  0 * SIZE(%ecx), %xmm0	addss	%xmm0, %xmm4	movss	%xmm4,  0 * SIZE(%ecx)	movss	  1 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm4, %xmm4	addss	%xmm0, %xmm4	movss	%xmm4,  1 * SIZE(%ecx)	movss	  2 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm4, %xmm4	addss	%xmm0, %xmm4	movss	%xmm4,  2 * SIZE(%ecx)	movss	  3 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm4, %xmm4	addss	%xmm0, %xmm4	movss	%xmm4,  3 * SIZE(%ecx)#endif	addl	$4 * SIZE, %ecx	jmp	.L120.L119:	movss	  0 * SIZE(%ecx), %xmm0	addss	%xmm0, %xmm4	movss	%xmm4,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm4, %xmm4	addss	%xmm0, %xmm4	movss	%xmm4,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm4, %xmm4	addss	%xmm0, %xmm4	movss	%xmm4,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm4, %xmm4	addss	%xmm0, %xmm4	movss	%xmm4,  0 * SIZE(%ecx)	addl	%eax, %ecx.L120:	testl	$2, %esi	jle	.L130.L121:	pxor	%xmm4, %xmm4	pxor	%xmm5, %xmm5	pxor	%xmm6, %xmm6	pxor	%xmm7, %xmm7	movl	A, %edx		# aoffset = a	addl	$2 * SIZE, A	# a += 8	movl	BUFFER, %ebx	# boffset = buffer	movaps	  0 * SIZE(%ebx), %xmm0	movl	%edi, %eax	# i = min_n	sarl	$2, %eax	jle	.L126.L124:	KERNELMACRO2UNROLL( 0)	addl	$16 * SIZE, %ebx	decl	%eax	jg	.L124.L126:	movl	%edi, %eax	# i = min_n	andl	$3, %eax	jle	.L128.L127:	KERNELMACRO2( 0)	addl	$4 * SIZE, %ebx	decl	%eax	jg	.L127.L128:	addps	%xmm5, %xmm4	addps	%xmm7, %xmm6	addps	%xmm6, %xmm4	mulps	%xmm3, %xmm4	movl	INCY, %eax	movl	%ecx, %edx	cmpl	$4, %eax	jne	.L129#ifdef HAVE_SSE2	movsd	  0 * SIZE(%ecx), %xmm0	addps	%xmm0, %xmm4	movsd	%xmm4,  0 * SIZE(%ecx)#else	movss	  0 * SIZE(%ecx), %xmm0	addss	%xmm0, %xmm4	movss	%xmm4,  0 * SIZE(%ecx)	movss	  1 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm4, %xmm4	addss	%xmm0, %xmm4	movss	%xmm4,  1 * SIZE(%ecx)#endif	addl	$2 * SIZE, %ecx	jmp	.L130.L129:	movss	  0 * SIZE(%ecx), %xmm0	addss	%xmm0, %xmm4	movss	%xmm4,  0 * SIZE(%ecx)	addl	%eax, %ecx	movss	  0 * SIZE(%ecx), %xmm0	shufps	$0x39, %xmm4, %xmm4	addss	%xmm0, %xmm4	movss	%xmm4,  0 * SIZE(%ecx)	addl	%eax, %ecx.L130:	testl	$1, %esi	jle	.L99.L131:	pxor	%xmm4, %xmm4	pxor	%xmm5, %xmm5	pxor	%xmm6, %xmm6	pxor	%xmm7, %xmm7	movl	A, %edx		# aoffset = a	addl	$1 * SIZE, A	# a += 8	movl	BUFFER, %ebx	# boffset = buffer	movaps	  0 * SIZE(%ebx), %xmm0	movl	%edi, %eax	# i = min_n	sarl	$2, %eax	jle	.L136.L134:	KERNELMACRO1UNROLL( 0)	addl	$16 * SIZE, %ebx	decl	%eax	jg	.L134.L136:	movl	%edi, %eax	# i = min_n	andl	$3, %eax	jle	.L138.L137:	KERNELMACRO1( 0)	addl	$4 * SIZE, %ebx	decl	%eax	jg	.L137.L138:	addss	%xmm5, %xmm4	addss	%xmm7, %xmm6	addss	%xmm6, %xmm4	mulss	%xmm3, %xmm4	movss	  0 * SIZE(%ecx), %xmm0	addss	%xmm0, %xmm4	movss	%xmm4,  0 * SIZE(%ecx)	jmp	.L99	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -