📄 symv_u_sse.s
字号:
addps a3, yy1 movsd 16 * SIZE(A2), a3 movhps 18 * SIZE(A2), a3 movaps xtemp2, xt1 movaps 20 * SIZE(XX), xtemp2 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 16 * SIZE(A2, LDA, 1), a4 movhps 18 * SIZE(A2, LDA, 1), a4 movlps yy1, 12 * SIZE(YY) movhps yy1, 14 * SIZE(YY) movsd 16 * SIZE(YY), yy1 movhps 18 * SIZE(YY), yy1 addq $16 * SIZE, XX addq $16 * SIZE, YY addq $16 * SIZE, A1 addq $16 * SIZE, A2 decq I jg .L12 ALIGN_3.L14: testq $8, IS jle .L15 movaps xtemp1, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 4 * SIZE(A1), a1 movhps 6 * SIZE(A1), a1 movaps xtemp1, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 4 * SIZE(A1, LDA, 1), a2 movhps 6 * SIZE(A1, LDA, 1), a2 movaps xtemp1, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 4 * SIZE(A2), a3 movhps 6 * SIZE(A2), a3 movaps xtemp1, xt1 movaps 8 * SIZE(XX), xtemp1 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 4 * SIZE(A2, LDA, 1), a4 movhps 6 * SIZE(A2, LDA, 1), a4 movlps yy1, 0 * SIZE(YY) movhps yy1, 2 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhps 6 * SIZE(YY), yy1 movaps xtemp2, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 8 * SIZE(A1), a1 movhps 10 * SIZE(A1), a1 movaps xtemp2, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 8 * SIZE(A1, LDA, 1), a2 movhps 10 * SIZE(A1, LDA, 1), a2 movaps xtemp2, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 8 * SIZE(A2), a3 movhps 10 * SIZE(A2), a3 movaps xtemp2, xt1 movaps 12 * SIZE(XX), xtemp2 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 8 * SIZE(A2, LDA, 1), a4 movhps 10 * SIZE(A2, LDA, 1), a4 movlps yy1, 4 * SIZE(YY) movhps yy1, 6 * SIZE(YY) movsd 8 * SIZE(YY), yy1 movhps 10 * SIZE(YY), yy1 addq $8 * SIZE, XX addq $8 * SIZE, YY addq $8 * SIZE, A1 addq $8 * SIZE, A2 ALIGN_3.L15: testq $4, IS jle .L18 movaps xtemp1, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movaps xtemp1, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movaps xtemp1, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movaps xtemp1, xt1 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movlps yy1, 0 * SIZE(YY) movhps yy1, 2 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhps 6 * SIZE(YY), yy1 addq $4 * SIZE, XX addq $4 * SIZE, YY addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3.L18: movaps 0 * SIZE(NEW_X, IS, SIZE), atemp1 movss 0 * SIZE(A1), a1 movss 0 * SIZE(A1, LDA, 1), a2 movss 0 * SIZE(A2), a3 movss 0 * SIZE(A2, LDA, 1), a4 unpcklps a3, a1 unpcklps a4, a2 unpcklps a2, a1 mulps atemp1, a1 addps a1, xsum1 movsd 0 * SIZE(A1, LDA, 1), a1 movss 1 * SIZE(A2), a2 movhps 1 * SIZE(A2, LDA, 1), a2 shufps $0x84, a2, a1 mulps atemp1, a1 addps a1, xsum2 movsd 0 * SIZE(A2), a1 movss 2 * SIZE(A2), a2 movhps 2 * SIZE(A2, LDA, 1), a2 shufps $0x84, a2, a1 mulps atemp1, a1 addps a1, xsum3 movsd 0 * SIZE(A2, LDA, 1), a1 movhps 2 * SIZE(A2, LDA, 1), a1 mulps atemp1, a1 addps a1, xsum4#ifndef HAVE_SSE3 movaps xsum1, xtemp1 unpcklps xsum3, xsum1 unpckhps xsum3, xtemp1 movaps xsum2, xtemp2 unpcklps xsum4, xsum2 unpckhps xsum4, xtemp2 movaps xsum1, xsum3 unpcklps xsum2, xsum1 unpckhps xsum2, xsum3 movaps xtemp1, xsum4 unpcklps xtemp2, xtemp1 unpckhps xtemp2, xsum4 addps xsum3, xsum1 addps xtemp1, xsum4 addps xsum4, xsum1#else haddps xsum2, xsum1 haddps xsum4, xsum3 haddps xsum3, xsum1#endif addps xsum1, yy1 movlps yy1, 0 * SIZE(YY) movhps yy1, 2 * SIZE(YY) addq $4, IS movq IS, I addq $4, I cmpq M, I jle .L11 ALIGN_3.L20: testq $2, M jle .L30 movq A, A1 leaq (A, LDA, 2), A movsd 0 * SIZE(NEW_X, IS, SIZE), atemp4 pshufd $0x00, atemp4, atemp1 pshufd $0x55, atemp4, atemp2 pxor xsum1, xsum1 pxor xsum2, xsum2 movaps 0 * SIZE(NEW_X), xtemp1 movsd 0 * SIZE(A1), a1 movhps 2 * SIZE(A1), a1 movsd 0 * SIZE(A1, LDA, 1), a2 movhps 2 * SIZE(A1, LDA, 1), a2 movsd 0 * SIZE(NEW_Y), yy1 movhps 2 * SIZE(NEW_Y), yy1 movq NEW_X, XX movq NEW_Y, YY movq IS, I sarq $2, I jle .L28 ALIGN_3.L22: movaps xtemp1, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 4 * SIZE(A1), a1 movhps 6 * SIZE(A1), a1 movaps xtemp1, xt1 movaps 4 * SIZE(XX), xtemp1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 4 * SIZE(A1, LDA, 1), a2 movhps 6 * SIZE(A1, LDA, 1), a2 movlps yy1, 0 * SIZE(YY) movhps yy1, 2 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhps 6 * SIZE(YY), yy1 addq $4 * SIZE, XX addq $4 * SIZE, YY addq $4 * SIZE, A1 decq I jg .L22 ALIGN_3.L28: movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 movss 0 * SIZE(A1), a1 movss 0 * SIZE(A1, LDA, 1), a2 unpcklps a2, a1 mulps atemp1, a1 addps a1, xsum1 movsd 0 * SIZE(A1, LDA, 1), a1 mulps atemp1, a1 addps a1, xsum2#ifndef HAVE_SSE3 movhlps xsum1, xsum3 movhlps xsum2, xsum4 addps xsum3, xsum1 addps xsum4, xsum2 unpcklps xsum2, xsum1 movhlps xsum1, xsum2 addps xsum2, xsum1#else haddps xsum2, xsum1 haddps xsum1, xsum1#endif addps xsum1, yy1 movlps yy1, 0 * SIZE(YY) addq $2, IS ALIGN_3.L30: testq $1, M jle .L990 movq A, A1 movss 0 * SIZE(NEW_X, IS, SIZE), atemp1 pshufd $0x00, atemp1, atemp1 pxor xsum1, xsum1 pxor xsum2, xsum2 movss 0 * SIZE(NEW_Y), yy1 movss 0 * SIZE(NEW_X), xtemp1 movss 1 * SIZE(NEW_X), xtemp2 movss 0 * SIZE(A1), a1 movss 1 * SIZE(A1), a2 movq NEW_X, XX movq NEW_Y, YY movq IS, I sarq $1, I jle .L38 ALIGN_3.L32: movaps xtemp1, xt1 movss 2 * SIZE(XX), xtemp1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movss 2 * SIZE(A1), a1 movss yy1, 0 * SIZE(YY) movss 1 * SIZE(YY), yy1 movaps xtemp2, xt1 movss 3 * SIZE(XX), xtemp2 mulps a2, xt1 mulps atemp1, a2 addps xt1, xsum1 addps a2, yy1 movss 3 * SIZE(A1), a2 movss yy1, 1 * SIZE(YY) movss 2 * SIZE(YY), yy1 addq $2 * SIZE, XX addq $2 * SIZE, YY addq $2 * SIZE, A1 decq I jg .L32 ALIGN_3.L38: movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 movss 0 * SIZE(A1), a1 mulss atemp1, a1 addss a1, xsum1#ifndef HAVE_SSE3 movhlps xsum1, xsum3 movhlps xsum2, xsum4 addps xsum3, xsum1 addps xsum4, xsum2 unpcklps xsum2, xsum1 movhlps xsum1, xsum2 addps xsum2, xsum1#else addss xsum2, xsum1#endif addss xsum1, yy1 movss yy1, 0 * SIZE(YY) addq $2, IS ALIGN_3.L990: cmpq $SIZE, INCY je .L999 movq M, %rax sarq $3, %rax jle .L997 ALIGN_3.L996: movss 0 * SIZE(NEW_Y), %xmm0 movss 1 * SIZE(NEW_Y), %xmm1 movss 2 * SIZE(NEW_Y), %xmm2 movss 3 * SIZE(NEW_Y), %xmm3 movss 4 * SIZE(NEW_Y), %xmm4 movss 5 * SIZE(NEW_Y), %xmm5 movss 6 * SIZE(NEW_Y), %xmm6 movss 7 * SIZE(NEW_Y), %xmm7 movss %xmm0, 0 * SIZE(Y) addq INCY, Y movss %xmm1, 0 * SIZE(Y) addq INCY, Y movss %xmm2, 0 * SIZE(Y) addq INCY, Y movss %xmm3, 0 * SIZE(Y) addq INCY, Y movss %xmm4, 0 * SIZE(Y) addq INCY, Y movss %xmm5, 0 * SIZE(Y) addq INCY, Y movss %xmm6, 0 * SIZE(Y) addq INCY, Y movss %xmm7, 0 * SIZE(Y) addq INCY, Y addq $8 * SIZE, NEW_Y decq %rax jg .L996 ALIGN_3.L997: movq M, %rax andq $7, %rax jle .L999 ALIGN_3.L998: movss 0 * SIZE(NEW_Y), %xmm0 movss %xmm0, 0 * SIZE(Y) addq INCY, Y addq $1 * SIZE, NEW_Y decq %rax jg .L998 ALIGN_3.L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15#ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15#endif addq $STACKSIZE, %rsp ret EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -