⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemv_n_sse2.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
	/*
	 * Excerpt note: this listing starts in the middle of the 7-element
	 * case. The routine entry, the register setup, label .L99 and the
	 * KERNELMACROn definitions are outside this excerpt.
	 *
	 * Register roles as far as the visible code shows:
	 *   edi        min_n (per the existing comments), the inner trip count
	 *   esi        selects the case: compared against 6, 5, 4, 3, 2 below
	 *   ebx        boffset, walks the buffer
	 *   edx        aoffset while the kernels run, then reused as a second
	 *              pointer (ecx + stride) in the write-back code
	 *   ecx        pointer to the values that are read, updated and stored
	 *              back (presumably the y vector; confirm with the caller)
	 *   eax        loop counter, then the stride loaded from INCY
	 *   xmm3       factor applied to every accumulator (presumably alpha
	 *              in both lanes; set up outside this excerpt, confirm)
	 *   xmm4-xmm7  accumulators, two doubles each; a trailing odd element
	 *              uses only the low lane (mulsd/addsd)
	 *
	 * Every case has the same shape: clear accumulators, advance A, run
	 * the kernel macro (min_n >> 3) times unrolled by 8, then (min_n & 7)
	 * single steps, scale by xmm3, add the old values at ecx and store.
	 */
	movl	BUFFER, %ebx	# boffset = buffer
	movapd	  0 * SIZE(%ebx), %xmm0	# preload; presumably consumed by the kernel macro
	movl	%edi, %eax	# i = min_n
	sarl	$3, %eax	# i >>= 3, unrolled trip count
	jle	.L106
.L104:
	KERNELMACRO7( 0)
	KERNELMACRO7( 2)
	KERNELMACRO7( 4)
	KERNELMACRO7( 6)
	KERNELMACRO7( 8)
	KERNELMACRO7(10)
	KERNELMACRO7(12)
	KERNELMACRO7(14)
	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L104
.L106:
	movl	%edi, %eax	# i = min_n
	andl	$7, %eax	# i &= 7, leftover steps; result is never negative so jle acts as jz
	jle	.L108
.L107:
	KERNELMACRO7( 0)
	addl	$2 * SIZE, %ebx
	decl	%eax
	jg	.L107
.L108:
	/* Scale the 7 results by xmm3, interleaved with stride setup. */
	movsd	  0 * SIZE(%ecx), %xmm0	# low lane = old element 0
	mulpd	%xmm3, %xmm4
	movl	INCY, %eax
	mulpd	%xmm3, %xmm5
	movl	%ecx, %edx
	mulpd	%xmm3, %xmm6
	addl	%eax, %edx	# edx = ecx + stride
	mulsd	%xmm3, %xmm7
	/* NOTE(review): 8 is presumably SIZE, i.e. a unit stride in bytes; confirm. */
	cmpl	$8, %eax
	jne	.L109
	/* Stride == 8: elements are addressed as k * SIZE from ecx. */
	movhpd	  1 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm4
	movsd	  2 * SIZE(%ecx), %xmm0
	movhpd	  3 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm5
	movsd	  4 * SIZE(%ecx), %xmm0
	movhpd	  5 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm6
	movsd	  6 * SIZE(%ecx), %xmm0
	addsd	%xmm0, %xmm7
	movsd	%xmm4, 0 * SIZE(%ecx)
	unpckhpd %xmm4, %xmm4	# move high lane down so movsd can store it
	movsd	%xmm4, 1 * SIZE(%ecx)
	movsd	%xmm5, 2 * SIZE(%ecx)
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5, 3 * SIZE(%ecx)
	movsd	%xmm6, 4 * SIZE(%ecx)
	unpckhpd %xmm6, %xmm6
	movsd	%xmm6, 5 * SIZE(%ecx)
	movsd	%xmm7, 6 * SIZE(%ecx)
	jmp	.L99
.L109:
	/* Other strides: gather through edx, then scatter through ecx. */
	movhpd	  0 * SIZE(%edx), %xmm0
	addl	  %eax, %edx
	addpd	%xmm0, %xmm4
	movsd	  0 * SIZE(%edx), %xmm1
	addl	  %eax, %edx
	movhpd	  0 * SIZE(%edx), %xmm1
	addl	  %eax, %edx
	addpd	%xmm1, %xmm5
	movsd	  0 * SIZE(%edx), %xmm2
	addl	  %eax, %edx
	movhpd	  0 * SIZE(%edx), %xmm2
	addl	  %eax, %edx
	addpd	%xmm2, %xmm6
	movsd	  0 * SIZE(%edx), %xmm0
	addsd	%xmm0, %xmm7
	movsd	%xmm4, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	movsd	%xmm5, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	movsd	%xmm6, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	unpckhpd %xmm6, %xmm6
	movsd	%xmm6, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	movsd	%xmm7, 0 * SIZE(%ecx)
	jmp	.L99

	/* ---- esi == 6: six results in xmm4, xmm5, xmm6 ---- */
.L110:
	cmpl	$6, %esi
	jne	.L120
.L111:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	movl	A, %edx		# aoffset = a
	addl	$6 * SIZE, A	# a += 6
	movl	BUFFER, %ebx	# boffset = buffer
	movapd	  0 * SIZE(%ebx), %xmm0
	movl	%edi, %eax	# i = min_n
	sarl	$3, %eax
	jle	.L116
.L114:
	KERNELMACRO6( 0)
	KERNELMACRO6( 2)
	KERNELMACRO6( 4)
	KERNELMACRO6( 6)
	KERNELMACRO6( 8)
	KERNELMACRO6(10)
	KERNELMACRO6(12)
	KERNELMACRO6(14)
	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L114
.L116:
	movl	%edi, %eax	# i = min_n
	andl	$7, %eax
	jle	.L118
.L117:
	KERNELMACRO6( 0)
	addl	$2 * SIZE, %ebx
	decl	%eax
	jg	.L117
.L118:
	movsd	  0 * SIZE(%ecx), %xmm0
	mulpd	%xmm3, %xmm4
	movl	INCY, %eax
	mulpd	%xmm3, %xmm5
	movl	%ecx, %edx
	mulpd	%xmm3, %xmm6
	addl	%eax, %edx
	cmpl	$8, %eax
	jne	.L119
	/* Stride == 8 path. */
	movhpd	  1 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm4
	movsd	  2 * SIZE(%ecx), %xmm0
	movhpd	  3 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm5
	movsd	  4 * SIZE(%ecx), %xmm0
	movhpd	  5 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm6
	movsd	%xmm4, 0 * SIZE(%ecx)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4, 1 * SIZE(%ecx)
	movsd	%xmm5, 2 * SIZE(%ecx)
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5, 3 * SIZE(%ecx)
	movsd	%xmm6, 4 * SIZE(%ecx)
	unpckhpd %xmm6, %xmm6
	movsd	%xmm6, 5 * SIZE(%ecx)
	jmp	.L99
.L119:
	/* Strided path. */
	movhpd	  0 * SIZE(%edx), %xmm0
	addl	  %eax, %edx
	addpd	%xmm0, %xmm4
	movsd	  0 * SIZE(%edx), %xmm1
	addl	  %eax, %edx
	movhpd	  0 * SIZE(%edx), %xmm1
	addl	  %eax, %edx
	addpd	%xmm1, %xmm5
	movsd	  0 * SIZE(%edx), %xmm2
	addl	  %eax, %edx
	movhpd	  0 * SIZE(%edx), %xmm2
	addpd	%xmm2, %xmm6
	movsd	%xmm4, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	movsd	%xmm5, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	movsd	%xmm6, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	unpckhpd %xmm6, %xmm6
	movsd	%xmm6, 0 * SIZE(%ecx)
	jmp	.L99

	/* ---- esi == 5: four results in xmm4, xmm5 plus one in low xmm6 ---- */
.L120:
	cmpl	$5, %esi
	jne	.L130
.L121:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	movl	A, %edx		# aoffset = a
	addl	$5 * SIZE, A	# a += 5
	movl	BUFFER, %ebx	# boffset = buffer
	movapd	  0 * SIZE(%ebx), %xmm0
	movl	%edi, %eax	# i = min_n
	sarl	$3, %eax
	jle	.L126
.L124:
	KERNELMACRO5( 0)
	KERNELMACRO5( 2)
	KERNELMACRO5( 4)
	KERNELMACRO5( 6)
	KERNELMACRO5( 8)
	KERNELMACRO5(10)
	KERNELMACRO5(12)
	KERNELMACRO5(14)
	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L124
.L126:
	movl	%edi, %eax	# i = min_n
	andl	$7, %eax
	jle	.L128
.L127:
	KERNELMACRO5( 0)
	addl	$2 * SIZE, %ebx
	decl	%eax
	jg	.L127
.L128:
	movsd	  0 * SIZE(%ecx), %xmm0
	mulpd	%xmm3, %xmm4
	movl	INCY, %eax
	mulpd	%xmm3, %xmm5
	movl	%ecx, %edx
	mulsd	%xmm3, %xmm6	# fifth result lives in the low lane only
	addl	%eax, %edx
	cmpl	$8, %eax
	jne	.L129
	/*
	 * NOTE(review): the next load repeats the one at .L128; xmm0 and
	 * the memory at ecx are not modified in between, so it looks
	 * redundant (the 7-, 6- and 4-element cases omit it).
	 */
	movsd	  0 * SIZE(%ecx), %xmm0
	movhpd	  1 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm4
	movsd	  2 * SIZE(%ecx), %xmm0
	movhpd	  3 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm5
	movsd	  4 * SIZE(%ecx), %xmm0
	addsd	%xmm0, %xmm6
	movsd	%xmm4, 0 * SIZE(%ecx)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4, 1 * SIZE(%ecx)
	movsd	%xmm5, 2 * SIZE(%ecx)
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5, 3 * SIZE(%ecx)
	movsd	%xmm6, 4 * SIZE(%ecx)
	jmp	.L99
.L129:
	/* Strided path. */
	movhpd	  0 * SIZE(%edx), %xmm0
	addl	  %eax, %edx
	addpd	%xmm0, %xmm4
	movsd	  0 * SIZE(%edx), %xmm1
	addl	  %eax, %edx
	movhpd	  0 * SIZE(%edx), %xmm1
	addl	  %eax, %edx
	addpd	%xmm1, %xmm5
	movsd	  0 * SIZE(%edx), %xmm2
	addsd	%xmm2, %xmm6
	movsd	%xmm4, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	movsd	%xmm5, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	movsd	%xmm6, 0 * SIZE(%ecx)
	jmp	.L99

	/* ---- esi == 4: four results in xmm4, xmm5 ---- */
.L130:
	cmpl	$4, %esi
	jne	.L140
.L131:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	movl	A, %edx		# aoffset = a
	addl	$4 * SIZE, A	# a += 4
	movl	BUFFER, %ebx	# boffset = buffer
	movapd	  0 * SIZE(%ebx), %xmm0
	movl	%edi, %eax	# i = min_n
	sarl	$3, %eax
	jle	.L136
.L134:
	KERNELMACRO4( 0)
	KERNELMACRO4( 2)
	KERNELMACRO4( 4)
	KERNELMACRO4( 6)
	KERNELMACRO4( 8)
	KERNELMACRO4(10)
	KERNELMACRO4(12)
	KERNELMACRO4(14)
	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L134
.L136:
	movl	%edi, %eax	# i = min_n
	andl	$7, %eax
	jle	.L138
.L137:
	KERNELMACRO4( 0)
	addl	$2 * SIZE, %ebx
	decl	%eax
	jg	.L137
.L138:
	movsd	  0 * SIZE(%ecx), %xmm0
	mulpd	%xmm3, %xmm4
	movl	INCY, %eax
	mulpd	%xmm3, %xmm5
	movl	%ecx, %edx
	addl	%eax, %edx
	cmpl	$8, %eax
	jne	.L139
	/* Stride == 8 path. */
	movhpd	  1 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm4
	movsd	  2 * SIZE(%ecx), %xmm0
	movhpd	  3 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm5
	movsd	%xmm4, 0 * SIZE(%ecx)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4, 1 * SIZE(%ecx)
	movsd	%xmm5, 2 * SIZE(%ecx)
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5, 3 * SIZE(%ecx)
	jmp	.L99
.L139:
	/* Strided path. */
	movhpd	  0 * SIZE(%edx), %xmm0
	addl	  %eax, %edx
	addpd	%xmm0, %xmm4
	movsd	  0 * SIZE(%edx), %xmm1
	addl	  %eax, %edx
	movhpd	  0 * SIZE(%edx), %xmm1
	addpd	%xmm1, %xmm5
	movsd	%xmm4, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	movsd	%xmm5, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5, 0 * SIZE(%ecx)
	jmp	.L99

	/* ---- esi == 3: two results in xmm4 plus one in low xmm5 ---- */
.L140:
	cmpl	$3, %esi
	jne	.L150
.L141:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	movl	A, %edx		# aoffset = a
	addl	$3 * SIZE, A	# a += 3
	movl	BUFFER, %ebx	# boffset = buffer
	movapd	  0 * SIZE(%ebx), %xmm0
	movl	%edi, %eax	# i = min_n
	sarl	$3, %eax
	jle	.L146
.L144:
	KERNELMACRO3( 0)
	KERNELMACRO3( 2)
	KERNELMACRO3( 4)
	KERNELMACRO3( 6)
	KERNELMACRO3( 8)
	KERNELMACRO3(10)
	KERNELMACRO3(12)
	KERNELMACRO3(14)
	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L144
.L146:
	movl	%edi, %eax	# i = min_n
	andl	$7, %eax
	jle	.L148
.L147:
	KERNELMACRO3( 0)
	addl	$2 * SIZE, %ebx
	decl	%eax
	jg	.L147
.L148:
	movsd	  0 * SIZE(%ecx), %xmm0
	mulpd	%xmm3, %xmm4
	movl	INCY, %eax
	mulsd	%xmm3, %xmm5	# third result lives in the low lane only
	movl	%ecx, %edx
	addl	%eax, %edx
	cmpl	$8, %eax
	jne	.L149
	/* NOTE(review): repeats the load at .L148; looks redundant. */
	movsd	  0 * SIZE(%ecx), %xmm0
	movhpd	  1 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm4
	movsd	  2 * SIZE(%ecx), %xmm0
	addsd	%xmm0, %xmm5
	movsd	%xmm4, 0 * SIZE(%ecx)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4, 1 * SIZE(%ecx)
	movsd	%xmm5, 2 * SIZE(%ecx)
	jmp	.L99
.L149:
	/* Strided path. */
	movhpd	  0 * SIZE(%edx), %xmm0
	addl	  %eax, %edx
	addpd	%xmm0, %xmm4
	movsd	  0 * SIZE(%edx), %xmm1
	addsd	%xmm1, %xmm5
	movsd	%xmm4, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	movsd	%xmm5, 0 * SIZE(%ecx)
	jmp	.L99

	/* ---- esi == 2: two results in xmm4 ---- */
.L150:
	cmpl	$2, %esi
	jne	.L160
.L151:
	pxor	%xmm4, %xmm4
	movl	A, %edx		# aoffset = a
	addl	$2 * SIZE, A	# a += 2
	movl	BUFFER, %ebx	# boffset = buffer
	movapd	  0 * SIZE(%ebx), %xmm0
	movl	%edi, %eax	# i = min_n
	sarl	$3, %eax
	jle	.L156
.L154:
	KERNELMACRO2( 0)
	KERNELMACRO2( 2)
	KERNELMACRO2( 4)
	KERNELMACRO2( 6)
	KERNELMACRO2( 8)
	KERNELMACRO2(10)
	KERNELMACRO2(12)
	KERNELMACRO2(14)
	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L154
.L156:
	movl	%edi, %eax	# i = min_n
	andl	$7, %eax
	jle	.L158
.L157:
	KERNELMACRO2( 0)
	addl	$2 * SIZE, %ebx
	decl	%eax
	jg	.L157
.L158:
	movsd	  0 * SIZE(%ecx), %xmm0
	mulpd	%xmm3, %xmm4
	movl	INCY, %eax
	movl	%ecx, %edx
	addl	%eax, %edx
	cmpl	$8, %eax
	jne	.L159
	/* NOTE(review): repeats the load at .L158; looks redundant. */
	movsd	  0 * SIZE(%ecx), %xmm0
	movhpd	  1 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm4
	movsd	%xmm4, 0 * SIZE(%ecx)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4, 1 * SIZE(%ecx)
	jmp	.L99
.L159:
	/* Strided path. */
	movhpd	  0 * SIZE(%edx), %xmm0
	addl	  %eax, %edx
	addpd	%xmm0, %xmm4
	movsd	%xmm4, 0 * SIZE(%ecx)
	addl	  %eax, %ecx
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4, 0 * SIZE(%ecx)
	jmp	.L99

	/*
	 * ---- fall-through case: one result in low xmm4 ----
	 * Reached when esi != 2 at .L150; the value of esi is not checked
	 * again here. A single element is updated, so no stride is needed.
	 */
.L160:
	pxor	%xmm4, %xmm4
	movl	A, %edx		# aoffset = a
	addl	$1 * SIZE, A	# a += 1
	movl	BUFFER, %ebx	# boffset = buffer
	movapd	  0 * SIZE(%ebx), %xmm0
	movl	%edi, %eax	# i = min_n
	sarl	$3, %eax
	jle	.L166
.L164:
	KERNELMACRO1( 0)
	KERNELMACRO1( 2)
	KERNELMACRO1( 4)
	KERNELMACRO1( 6)
	KERNELMACRO1( 8)
	KERNELMACRO1(10)
	KERNELMACRO1(12)
	KERNELMACRO1(14)
	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L164
.L166:
	movl	%edi, %eax	# i = min_n
	andl	$7, %eax
	jle	.L168
.L167:
	KERNELMACRO1( 0)
	addl	$2 * SIZE, %ebx
	decl	%eax
	jg	.L167
.L168:
	mulsd	%xmm3, %xmm4
	movsd	  0 * SIZE(%ecx), %xmm0
	addsd	%xmm0, %xmm4
	movsd	%xmm4, 0 * SIZE(%ecx)
	jmp	.L99

	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -