📄 gemv_n_sse.s
字号:
/*
 * Tail section of a single-precision GEMV-N (y += alpha * A * x) SSE kernel,
 * AT&T/GAS syntax, x86-64.  This chunk begins mid-function: the accumulation
 * loops for the 16-row blocking case, the function prologue, and the macro
 * definitions (ALIGN_4, SIZE, ALPHA, CO, INCY, J, M, N, IS, A, AO, BO,
 * BUFFER, MIN_N, PLDA_M, STACKSIZE, KERNELMACRO*, PROLOGUE/EPILOGUE) live in
 * the part of the file not shown here.  From the code below: SIZE is the
 * element size in bytes and INCY is the y-stride in bytes; the `cmpq $4, INCY`
 * unit-stride tests imply SIZE == 4 (float) — TODO confirm against the header.
 *
 * Layout of this tail:
 *   .L28/.L29  - write back one 16-row result block (contiguous / strided y)
 *   .L99/.L999 - advance to next column panel, or restore registers and return
 *   .L100..    - remainder rows: 8 (.L101), 4 (.L111), 2 (.L121), 1 (.L131)
 */

ALIGN_4
.L28:
	/* Scale the 16 accumulated dot products (4 lanes x 4 regs) by alpha. */
	mulps	ALPHA, %xmm4
	mulps	ALPHA, %xmm5
	mulps	ALPHA, %xmm6
	mulps	ALPHA, %xmm7

	/* INCY is in bytes; 4 == one float apart, i.e. y is contiguous.
	   Strided y takes the scalar path at .L29. */
	cmpq	$4, INCY
	jne	.L29

	/* Contiguous y: load y in 64-bit halves.  movsd/movhpd/unpckhpd are
	   used purely as 64-bit data movers here; pd vs ps makes no difference
	   for moves, and 64-bit loads avoid assuming 16-byte alignment of y. */
	movsd	 0 * SIZE(CO), %xmm0
	movhpd	 2 * SIZE(CO), %xmm0
	addps	%xmm0, %xmm4

	movsd	 4 * SIZE(CO), %xmm1
	movhpd	 6 * SIZE(CO), %xmm1
	addps	%xmm1, %xmm5

	movsd	 8 * SIZE(CO), %xmm2
	movhpd	10 * SIZE(CO), %xmm2
	addps	%xmm2, %xmm6

	movsd	12 * SIZE(CO), %xmm0
	movhpd	14 * SIZE(CO), %xmm0
	addps	%xmm0, %xmm7

	/* Store low 64 bits, then unpckhpd duplicates the high 64 bits into
	   the low half so the second movsd writes lanes 2..3. */
	movsd	%xmm4,  0 * SIZE(CO)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4,  2 * SIZE(CO)

	movsd	%xmm5,  4 * SIZE(CO)
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5,  6 * SIZE(CO)

	movsd	%xmm6,  8 * SIZE(CO)
	unpckhpd %xmm6, %xmm6
	movsd	%xmm6, 10 * SIZE(CO)

	movsd	%xmm7, 12 * SIZE(CO)
	unpckhpd %xmm7, %xmm7
	movsd	%xmm7, 14 * SIZE(CO)

	addq	$16 * SIZE, CO		# y pointer += 16 elements

	decq	J			# next 16-row block (loop head .L21 is above this chunk)
	jg	.L21

	/* All 16-row blocks done: J = M mod 16 selects the remainder code. */
	movq	M, J
	andq	$15, J
	jne	.L100
	jmp	.L99

ALIGN_4
.L29:
	/* Strided y: store the 16 results one scalar at a time.
	   shufps $0x39 rotates the vector one lane right (abcd -> bcda),
	   bringing the next element into lane 0 for addss/movss. */
	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm6
	movss	%xmm6, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm6, %xmm6
	addss	%xmm0, %xmm6
	movss	%xmm6, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm6, %xmm6
	addss	%xmm0, %xmm6
	movss	%xmm6, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm6, %xmm6
	addss	%xmm0, %xmm6
	movss	%xmm6, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm7
	movss	%xmm7, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm7, %xmm7
	addss	%xmm0, %xmm7
	movss	%xmm7, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm7, %xmm7
	addss	%xmm0, %xmm7
	movss	%xmm7, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm7, %xmm7
	addss	%xmm0, %xmm7
	movss	%xmm7, 0 * SIZE(CO)
	addq	INCY, CO

	decq	J			# next 16-row block
	jg	.L21

	movq	M, J			# J = M mod 16 -> remainder rows
	andq	$15, J
	jne	.L100
	/* falls through to .L99 when no remainder */

ALIGN_4
.L99:
	/* Column panel finished: advance A to the next panel and loop while
	   IS < N (panel loop head .L01 is above this chunk). */
	addq	PLDA_M, A
	cmpq	N, IS
	jl	.L01

ALIGN_4
.L999:
	/* Epilogue: restore callee-saved registers saved by the prologue
	   (not visible here).  rbx/rbp/r12-r15 per SysV; Windows x64 also
	   treats rdi/rsi and xmm6-xmm15 as callee-saved. */
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15
#ifdef WINDOWS_ABI
	movq	48(%rsp), %rdi
	movq	56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif
	addq	$STACKSIZE, %rsp
	ret

ALIGN_4
.L100:
	/* Remainder dispatch: J = M mod 16; peel 8, then 4, 2, 1 rows. */
	movq	M, J
	testq	$8, J
	jle	.L110

ALIGN_4
.L101:
	/* 8 remaining rows: two 4-wide accumulators (+ two for unrolling). */
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	movq	A, AO			# aoffset = a
	addq	$8 * SIZE, A		# a += 8
	movq	BUFFER, BO		# boffset = buffer (packed x)
	movaps	0 * SIZE(BO), %xmm0

	movq	MIN_N, I		# i = min_n
	sarq	$2, I			# unrolled by 4 columns
	jle	.L106

ALIGN_4
.L104:
	KERNELMACRO8UNROLL( 0)
	KERNELMACRO8UNROLL( 8)
	addq	$16 * SIZE, BO
	decq	I
	jg	.L104

ALIGN_4
.L106:
	movq	MIN_N, I		# i = min_n
	andq	$3, I			# leftover columns (0..3)
	jle	.L108

ALIGN_4
.L107:
	KERNELMACRO8( 0)
	addq	$4 * SIZE, BO
	decq	I
	jg	.L107

ALIGN_4
.L108:
	/* Collapse unroll accumulators, scale, and write 8 results. */
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5
	mulps	ALPHA, %xmm4
	mulps	ALPHA, %xmm5

	cmpq	$4, INCY		# unit stride?
	jne	.L109

	movsd	0 * SIZE(CO), %xmm0
	movhpd	2 * SIZE(CO), %xmm0
	addps	%xmm0, %xmm4
	movsd	4 * SIZE(CO), %xmm1
	movhpd	6 * SIZE(CO), %xmm1
	addps	%xmm1, %xmm5

	movsd	%xmm4, 0 * SIZE(CO)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4, 2 * SIZE(CO)
	movsd	%xmm5, 4 * SIZE(CO)
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5, 6 * SIZE(CO)

	addq	$8 * SIZE, CO
	jmp	.L110

ALIGN_4
.L109:
	/* Strided y: scalar write-back of 8 results (lane rotation as .L29). */
	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(CO)
	addq	INCY, CO

ALIGN_4
.L110:
	testq	$4, J			# 4 remaining rows?
	jle	.L120

ALIGN_4
.L111:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	movq	A, AO			# aoffset = a
	addq	$4 * SIZE, A		# a += 4
	movq	BUFFER, BO		# boffset = buffer
	movaps	0 * SIZE(BO), %xmm0

	movq	MIN_N, I		# i = min_n
	sarq	$2, I
	jle	.L116

ALIGN_4
.L114:
	KERNELMACRO4UNROLL( 0)
	addq	$16 * SIZE, BO
	decq	I
	jg	.L114

ALIGN_4
.L116:
	movq	MIN_N, I		# i = min_n
	andq	$3, I
	jle	.L118

ALIGN_4
.L117:
	KERNELMACRO4( 0)
	addq	$4 * SIZE, BO
	decq	I
	jg	.L117

ALIGN_4
.L118:
	/* Reduce the four accumulators into xmm4, scale, write 4 results. */
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm6, %xmm4
	mulps	ALPHA, %xmm4

	cmpq	$4, INCY		# unit stride?
	jne	.L119

	movsd	0 * SIZE(CO), %xmm0
	movhpd	2 * SIZE(CO), %xmm0
	addps	%xmm0, %xmm4
	movsd	%xmm4, 0 * SIZE(CO)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4, 2 * SIZE(CO)
	addq	$4 * SIZE, CO
	jmp	.L120

ALIGN_4
.L119:
	/* Strided y: scalar write-back of 4 results. */
	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

/* NOTE(review): unlike its sibling labels, .L120 has no ALIGN_4 before it in
   this copy of the source — possibly lost in extraction; verify upstream. */
.L120:
	testq	$2, J			# 2 remaining rows?
	jle	.L130

ALIGN_4
.L121:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	movq	A, AO			# aoffset = a
	addq	$2 * SIZE, A		# a += 2
	movq	BUFFER, BO		# boffset = buffer
	movaps	0 * SIZE(BO), %xmm0

	movq	MIN_N, I		# i = min_n
	sarq	$2, I
	jle	.L126

ALIGN_4
.L124:
	KERNELMACRO2UNROLL( 0)
	addq	$16 * SIZE, BO
	decq	I
	jg	.L124

ALIGN_4
.L126:
	movq	MIN_N, I		# i = min_n
	andq	$3, I
	jle	.L128

ALIGN_4
.L127:
	KERNELMACRO2( 0)
	addq	$4 * SIZE, BO
	decq	I
	jg	.L127

ALIGN_4
.L128:
	/* Reduce, scale, write 2 results (only lanes 0..1 are meaningful). */
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm6, %xmm4
	mulps	ALPHA, %xmm4

	cmpq	$4, INCY		# unit stride?
	jne	.L129

	movsd	0 * SIZE(CO), %xmm0	# 64-bit load = 2 floats
	addps	%xmm0, %xmm4
	movsd	%xmm4, 0 * SIZE(CO)
	addq	$2 * SIZE, CO
	jmp	.L130

ALIGN_4
.L129:
	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

	movss	0 * SIZE(CO), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	addq	INCY, CO

ALIGN_4
.L130:
	testq	$1, J			# 1 remaining row?
	jle	.L99

ALIGN_4
.L131:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	movq	A, AO			# aoffset = a
	addq	$1 * SIZE, A		# a += 1
	movq	BUFFER, BO		# boffset = buffer
	movaps	0 * SIZE(BO), %xmm0

	movq	MIN_N, I		# i = min_n
	sarq	$2, I
	jle	.L136

ALIGN_4
.L134:
	KERNELMACRO1UNROLL( 0)
	addq	$16 * SIZE, BO
	decq	I
	jg	.L134

ALIGN_4
.L136:
	movq	MIN_N, I		# i = min_n
	andq	$3, I
	jle	.L138

ALIGN_4
.L137:
	KERNELMACRO1( 0)
	addq	$4 * SIZE, BO
	decq	I
	jg	.L137

ALIGN_4
.L138:
	/* Scalar reduce, scale, and single write-back (stride irrelevant
	   for one element). */
	addss	%xmm5, %xmm4
	addss	%xmm7, %xmm6
	addss	%xmm6, %xmm4
	mulss	ALPHA, %xmm4

	movss	0 * SIZE(CO), %xmm0
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(CO)
	jmp	.L99

	EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -