📄 gemv_n_sse.s
字号:
/*
 * Tail section of a single-precision GEMV-N (y += alpha * A * x) SSE kernel,
 * AT&T/GAS syntax, x86-64.  This chunk begins mid-function: the accumulation
 * loops for the 16-row blocking case, the function prologue, and the macro
 * definitions (ALIGN_4, SIZE, ALPHA, CO, INCY, J, M, N, IS, A, AO, BO,
 * BUFFER, MIN_N, PLDA_M, STACKSIZE, KERNELMACRO*, PROLOGUE/EPILOGUE) live in
 * the part of the file not shown here.  From the code below: SIZE is the
 * element size in bytes and INCY is the y-stride in bytes; the `cmpq $4, INCY`
 * unit-stride tests imply SIZE == 4 (float) — TODO confirm against the header.
 *
 * Layout of this tail:
 *   .L28/.L29  - write back one 16-row result block (contiguous / strided y)
 *   .L99/.L999 - advance to next column panel, or restore registers and return
 *   .L100..    - remainder rows: 8 (.L101), 4 (.L111), 2 (.L121), 1 (.L131)
 */

ALIGN_4
.L28:
	/* Scale the 16 accumulated dot products (4 lanes x 4 regs) by alpha. */
	mulps	ALPHA, %xmm4
	mulps	ALPHA, %xmm5
	mulps	ALPHA, %xmm6
	mulps	ALPHA, %xmm7

	/* INCY is in bytes; 4 == one float apart, i.e. y is contiguous.
	   Strided y takes the scalar path at .L29. */
	cmpq	$4, INCY
	jne	.L29

	/* Contiguous y: load y in 64-bit halves.  movsd/movhpd/unpckhpd are
	   used purely as 64-bit data movers here; pd vs ps makes no difference
	   for moves, and 64-bit loads avoid assuming 16-byte alignment of y. */
	movsd	 0 * SIZE(CO), %xmm0
	movhpd	 2 * SIZE(CO), %xmm0
	addps	%xmm0, %xmm4

	movsd	 4 * SIZE(CO), %xmm1
	movhpd	 6 * SIZE(CO), %xmm1
	addps	%xmm1, %xmm5

	movsd	 8 * SIZE(CO), %xmm2
	movhpd	10 * SIZE(CO), %xmm2
	addps	%xmm2, %xmm6

	movsd	12 * SIZE(CO), %xmm0
	movhpd	14 * SIZE(CO), %xmm0
	addps	%xmm0, %xmm7

	/* Store low 64 bits, then unpckhpd duplicates the high 64 bits into
	   the low half so the second movsd writes lanes 2..3. */
	movsd	%xmm4,  0 * SIZE(CO)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4,  2 * SIZE(CO)

	movsd	%xmm5,  4 * SIZE(CO)
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5,  6 * SIZE(CO)

	movsd	%xmm6,  8 * SIZE(CO)
	unpckhpd %xmm6, %xmm6
	movsd	%xmm6, 10 * SIZE(CO)

	movsd	%xmm7, 12 * SIZE(CO)
	unpckhpd %xmm7, %xmm7
	movsd	%xmm7, 14 * SIZE(CO)

	addq	$16 * SIZE, CO		# y pointer += 16 elements

	decq	J			# next 16-row block (loop head .L21 is above this chunk)
	jg	.L21

	/* All 16-row blocks done: J = M mod 16 selects the remainder code. */
	movq	M, J
	andq	$15, J
	jne	.L100
	jmp	.L99

ALIGN_4
.L29:
	/* Strided y: store the 16 results one scalar at a time.
	   shufps $0x39 rotates the vector one lane right (abcd -> bcda),
	   bringing the next element into lane 0 for addss/movss. */
	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm6
	movss	%xmm6, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm6, %xmm6
	addss	%xmm0, %xmm6
	movss	%xmm6, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm6, %xmm6
	addss	%xmm0, %xmm6
	movss	%xmm6, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm6, %xmm6
	addss	%xmm0, %xmm6
	movss	%xmm6, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm7
	movss	%xmm7, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm7, %xmm7
	addss	%xmm0, %xmm7
	movss	%xmm7, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm7, %xmm7
	addss	%xmm0, %xmm7
	movss	%xmm7, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm7, %xmm7
	addss	%xmm0, %xmm7
	movss	%xmm7, 0 * SIZE(CO)
	addq	INCY, CO

	decq	J			# next 16-row block
	jg	.L21

	movq	M, J			# J = M mod 16 -> remainder rows
	andq	$15, J
	jne	.L100
	/* falls through to .L99 when no remainder */

ALIGN_4
.L99:
	/* Column panel finished: advance A to the next panel and loop while
	   IS < N (panel loop head .L01 is above this chunk). */
	addq	PLDA_M, A
	cmpq	N, IS
	jl	.L01

ALIGN_4
.L999:
	/* Epilogue: restore callee-saved registers saved by the prologue
	   (not visible here).  rbx/rbp/r12-r15 per SysV; Windows x64 also
	   treats rdi/rsi and xmm6-xmm15 as callee-saved. */
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15
#ifdef WINDOWS_ABI
	movq	48(%rsp), %rdi
	movq	56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif
	addq	$STACKSIZE, %rsp
	ret

ALIGN_4
.L100:
	/* Remainder dispatch: J = M mod 16; peel 8, then 4, 2, 1 rows. */
	movq	M, J
	testq	$8, J
	jle	.L110

ALIGN_4
.L101:
	/* 8 remaining rows: two 4-wide accumulators (+ two for unrolling). */
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	movq	A, AO			# aoffset = a
	addq	$8 * SIZE, A		# a += 8
	movq	BUFFER, BO		# boffset = buffer (packed x)
	movaps	0 * SIZE(BO), %xmm0

	movq	MIN_N, I		# i = min_n
	sarq	$2, I			# unrolled by 4 columns
	jle	.L106

ALIGN_4
.L104:
	KERNELMACRO8UNROLL( 0)
	KERNELMACRO8UNROLL( 8)
	addq	$16 * SIZE, BO
	decq	I
	jg	.L104

ALIGN_4
.L106:
	movq	MIN_N, I		# i = min_n
	andq	$3, I			# leftover columns (0..3)
	jle	.L108

ALIGN_4
.L107:
	KERNELMACRO8( 0)
	addq	$4 * SIZE, BO
	decq	I
	jg	.L107

ALIGN_4
.L108:
	/* Collapse unroll accumulators, scale, and write 8 results. */
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5
	mulps	ALPHA, %xmm4
	mulps	ALPHA, %xmm5

	cmpq	$4, INCY		# unit stride?
	jne	.L109

	movsd	0 * SIZE(CO), %xmm0
	movhpd	2 * SIZE(CO), %xmm0
	addps	%xmm0, %xmm4
	movsd	4 * SIZE(CO), %xmm1
	movhpd	6 * SIZE(CO), %xmm1
	addps	%xmm1, %xmm5

	movsd	%xmm4, 0 * SIZE(CO)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4, 2 * SIZE(CO)
	movsd	%xmm5, 4 * SIZE(CO)
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5, 6 * SIZE(CO)

	addq	$8 * SIZE, CO
	jmp	.L110

ALIGN_4
.L109:
	/* Strided y: scalar write-back of 8 results (lane rotation as .L29). */
	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(CO)
	addq	INCY, CO

ALIGN_4
.L110:
	testq	$4, J			# 4 remaining rows?
	jle	.L120

ALIGN_4
.L111:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	movq	A, AO			# aoffset = a
	addq	$4 * SIZE, A		# a += 4
	movq	BUFFER, BO		# boffset = buffer
	movaps	0 * SIZE(BO), %xmm0

	movq	MIN_N, I		# i = min_n
	sarq	$2, I
	jle	.L116

ALIGN_4
.L114:
	KERNELMACRO4UNROLL( 0)
	addq	$16 * SIZE, BO
	decq	I
	jg	.L114

ALIGN_4
.L116:
	movq	MIN_N, I		# i = min_n
	andq	$3, I
	jle	.L118

ALIGN_4
.L117:
	KERNELMACRO4( 0)
	addq	$4 * SIZE, BO
	decq	I
	jg	.L117

ALIGN_4
.L118:
	/* Reduce the four accumulators into xmm4, scale, write 4 results. */
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm6, %xmm4
	mulps	ALPHA, %xmm4

	cmpq	$4, INCY		# unit stride?
	jne	.L119

	movsd	0 * SIZE(CO), %xmm0
	movhpd	2 * SIZE(CO), %xmm0
	addps	%xmm0, %xmm4
	movsd	%xmm4, 0 * SIZE(CO)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4, 2 * SIZE(CO)
	addq	$4 * SIZE, CO
	jmp	.L120

ALIGN_4
.L119:
	/* Strided y: scalar write-back of 4 results. */
	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

/* NOTE(review): unlike its sibling labels, .L120 has no ALIGN_4 before it in
   this copy of the source — possibly lost in extraction; verify upstream. */
.L120:
	testq	$2, J			# 2 remaining rows?
	jle	.L130

ALIGN_4
.L121:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	movq	A, AO			# aoffset = a
	addq	$2 * SIZE, A		# a += 2
	movq	BUFFER, BO		# boffset = buffer
	movaps	0 * SIZE(BO), %xmm0

	movq	MIN_N, I		# i = min_n
	sarq	$2, I
	jle	.L126

ALIGN_4
.L124:
	KERNELMACRO2UNROLL( 0)
	addq	$16 * SIZE, BO
	decq	I
	jg	.L124

ALIGN_4
.L126:
	movq	MIN_N, I		# i = min_n
	andq	$3, I
	jle	.L128

ALIGN_4
.L127:
	KERNELMACRO2( 0)
	addq	$4 * SIZE, BO
	decq	I
	jg	.L127

ALIGN_4
.L128:
	/* Reduce, scale, write 2 results (only lanes 0..1 are meaningful). */
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm6, %xmm4
	mulps	ALPHA, %xmm4

	cmpq	$4, INCY		# unit stride?
	jne	.L129

	movsd	0 * SIZE(CO), %xmm0	# 64-bit load = 2 floats
	addps	%xmm0, %xmm4
	movsd	%xmm4, 0 * SIZE(CO)
	addq	$2 * SIZE, CO
	jmp	.L130

ALIGN_4
.L129:
	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

ALIGN_4
.L130:
	testq	$1, J			# 1 remaining row?
	jle	.L99

ALIGN_4
.L131:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	movq	A, AO			# aoffset = a
	addq	$1 * SIZE, A		# a += 1
	movq	BUFFER, BO		# boffset = buffer
	movaps	0 * SIZE(BO), %xmm0

	movq	MIN_N, I		# i = min_n
	sarq	$2, I
	jle	.L136

ALIGN_4
.L134:
	KERNELMACRO1UNROLL( 0)
	addq	$16 * SIZE, BO
	decq	I
	jg	.L134

ALIGN_4
.L136:
	movq	MIN_N, I		# i = min_n
	andq	$3, I
	jle	.L138

ALIGN_4
.L137:
	KERNELMACRO1( 0)
	addq	$4 * SIZE, BO
	decq	I
	jg	.L137

ALIGN_4
.L138:
	/* Scalar reduce, scale, and single write-back (stride irrelevant
	   for one element). */
	addss	%xmm5, %xmm4
	addss	%xmm7, %xmm6
	addss	%xmm6, %xmm4
	mulss	ALPHA, %xmm4

	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	jmp	.L99

	EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -