symv_l_sse.s
来自「Optimized GotoBLAS libraries」· S 代码 · 共 993 行 · 第 1/2 页
S
993 行
movsd 12 * SIZE(A2), a3 movhps 14 * SIZE(A2), a3#if !defined(CORE2) && !defined(PENRYN) PREFETCHW PREFETCHSIZE(YY)#endif movaps xtemp1, xt1 movaps 16 * SIZE(XX), xtemp1 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 12 * SIZE(A2, LDA, 1), a4 movhps 14 * SIZE(A2, LDA, 1), a4 movlps yy1, 8 * SIZE(YY) movhps yy1, 10 * SIZE(YY) movsd 12 * SIZE(YY), yy1 movhps 14 * SIZE(YY), yy1 movaps xtemp2, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 16 * SIZE(A1), a1 movhps 18 * SIZE(A1), a1 PREFETCH PREFETCHSIZE(A2, LDA, 1) movaps xtemp2, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 16 * SIZE(A1, LDA, 1), a2 movhps 18 * SIZE(A1, LDA, 1), a2 movaps xtemp2, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 16 * SIZE(A2), a3 movhps 18 * SIZE(A2), a3 movaps xtemp2, xt1 movaps 20 * SIZE(XX), xtemp2 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 16 * SIZE(A2, LDA, 1), a4 movhps 18 * SIZE(A2, LDA, 1), a4 movlps yy1, 12 * SIZE(YY) movhps yy1, 14 * SIZE(YY) movsd 16 * SIZE(YY), yy1 movhps 18 * SIZE(YY), yy1 addq $16 * SIZE, XX addq $16 * SIZE, YY addq $16 * SIZE, A1 addq $16 * SIZE, A2 decq I jg .L12 ALIGN_3.L14: movq M, I subq IS, I subq $4, I test $8, I jle .L15 movaps xtemp1, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 4 * SIZE(A1), a1 movhps 6 * SIZE(A1), a1 movaps xtemp1, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 4 * SIZE(A1, LDA, 1), a2 movhps 6 * SIZE(A1, LDA, 1), a2 movaps xtemp1, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 4 * SIZE(A2), a3 movhps 6 * SIZE(A2), a3 movaps xtemp1, xt1 movaps 8 * SIZE(XX), xtemp1 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 4 * SIZE(A2, LDA, 1), a4 movhps 6 * SIZE(A2, LDA, 1), a4 movlps yy1, 0 * SIZE(YY) movhps yy1, 2 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhps 6 * SIZE(YY), yy1 movaps xtemp2, xt1 mulps a1, xt1 mulps atemp1, a1 
addps xt1, xsum1 addps a1, yy1 movsd 8 * SIZE(A1), a1 movhps 10 * SIZE(A1), a1 movaps xtemp2, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 8 * SIZE(A1, LDA, 1), a2 movhps 10 * SIZE(A1, LDA, 1), a2 movaps xtemp2, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 8 * SIZE(A2), a3 movhps 10 * SIZE(A2), a3 movaps xtemp2, xt1 movaps 12 * SIZE(XX), xtemp2 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 8 * SIZE(A2, LDA, 1), a4 movhps 10 * SIZE(A2, LDA, 1), a4 movlps yy1, 4 * SIZE(YY) movhps yy1, 6 * SIZE(YY) movsd 8 * SIZE(YY), yy1 movhps 10 * SIZE(YY), yy1 addq $8 * SIZE, XX addq $8 * SIZE, YY addq $8 * SIZE, A1 addq $8 * SIZE, A2 ALIGN_3.L15: test $4, I jle .L17 movaps xtemp1, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 4 * SIZE(A1), a1 movaps xtemp1, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 4 * SIZE(A1, LDA, 1), a2 movaps xtemp1, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 4 * SIZE(A2), a3 movaps xtemp1, xt1 movsd 4 * SIZE(XX), xtemp1 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 4 * SIZE(A2, LDA, 1), a4 movlps yy1, 0 * SIZE(YY) movhps yy1, 2 * SIZE(YY) movsd 4 * SIZE(YY), yy1 addq $4 * SIZE, XX addq $4 * SIZE, YY addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3.L17: testq $2, M jle .L18 pxor xtemp2, xtemp2 movlhps xtemp2, a1 movaps xtemp1, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movss 2 * SIZE(A1), a1 movlhps xtemp2, a2 movaps xtemp1, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movss 2 * SIZE(A1, LDA, 1), a2 movlhps xtemp2, a3 movaps xtemp1, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movss 2 * SIZE(A2), a3 movlhps xtemp2, a4 movaps xtemp1, xt1 movss 2 * SIZE(XX), xtemp1 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movss 2 * SIZE(A2, LDA, 1), a4 movlps yy1, 0 * SIZE(YY) movss 2 * SIZE(YY), yy1 addq $2 * SIZE, 
XX addq $2 * SIZE, YY addq $2 * SIZE, A1 addq $2 * SIZE, A2 ALIGN_3.L18: testq $1, M jle .L19 movss 0 * SIZE(XX), xtemp1 movss 0 * SIZE(YY), yy1 movss 0 * SIZE(A1), a1 movss 0 * SIZE(A1, LDA, 1), a2 movss 0 * SIZE(A2), a3 movss 0 * SIZE(A2, LDA, 1), a4 movaps xtemp1, xt1 mulss a1, xt1 mulss atemp1, a1 addss xt1, xsum1 addss a1, yy1 movaps xtemp1, xt1 mulss a2, xt1 mulss atemp2, a2 addss xt1, xsum2 addss a2, yy1 movaps xtemp1, xt1 mulss a3, xt1 mulss atemp3, a3 addss xt1, xsum3 addss a3, yy1 movaps xtemp1, xt1 mulss a4, xt1 mulss atemp4, a4 addss xt1, xsum4 addss a4, yy1 movss yy1, 0 * SIZE(YY) ALIGN_3.L19:#ifndef HAVE_SSE3 movaps xsum1, xtemp1 unpcklps xsum3, xsum1 unpckhps xsum3, xtemp1 movaps xsum2, xtemp2 unpcklps xsum4, xsum2 unpckhps xsum4, xtemp2 movaps xsum1, xsum3 unpcklps xsum2, xsum1 unpckhps xsum2, xsum3 movaps xtemp1, xsum4 unpcklps xtemp2, xtemp1 unpckhps xtemp2, xsum4 addps xsum3, xsum1 addps xtemp1, xsum4 addps xsum4, xsum1#else haddps xsum2, xsum1 haddps xsum4, xsum3 haddps xsum3, xsum1#endif movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1 movhps 2 * SIZE(NEW_Y, IS, SIZE), yy1 addps xsum1, yy1 movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE) movhps yy1, 2 * SIZE(NEW_Y, IS, SIZE) addq $4, IS movq IS, I addq $4, I cmpq M, I jle .L11 ALIGN_3.L20: testq $2, M jle .L30 movq A, A1 leaq 2 * SIZE(A, LDA, 2), A movaps 0 * SIZE(NEW_X, IS, SIZE), atemp4#if defined(OPTERON) pxor xsum1, xsum1#endif movsd 0 * SIZE(A1), xsum1 mulps atemp4, xsum1 movss 1 * SIZE(A1), xsum2 movss 1 * SIZE(A1, LDA, 1), a2 unpcklps a2, xsum2 mulps atemp4, xsum2 pshufd $0x00, atemp4, atemp1 pshufd $0x55, atemp4, atemp2 testq $1, M jle .L29 movss 2 * SIZE(A1), a1 movss 2 * SIZE(A1, LDA, 1), a2 movss 2 * SIZE(NEW_X, IS, SIZE), xtemp1 movss 2 * SIZE(NEW_Y, IS, SIZE), yy1 movaps xtemp1, xt1 mulss a1, xt1 mulss atemp1, a1 addss xt1, xsum1 addps a1, yy1 movaps xtemp1, xt1 mulss a2, xt1 mulss atemp2, a2 addss xt1, xsum2 addss a2, yy1 movss yy1, 2 * SIZE(NEW_Y, IS, SIZE) ALIGN_3.L29:#ifndef HAVE_SSE3 unpcklps xsum2, 
xsum1 movhlps xsum1, xsum2 addps xsum2, xsum1#else haddps xsum2, xsum1 haddps xsum1, xsum1#endif movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1 addps xsum1, yy1 movlps yy1, 0 * SIZE(NEW_Y, IS, SIZE) addq $2, IS ALIGN_3.L30: testq $1, M jle .L990 movss 0 * SIZE(NEW_X, IS, SIZE), xsum1 mulss 0 * SIZE(A), xsum1 addss 0 * SIZE(NEW_Y, IS, SIZE), xsum1 movss xsum1, 0 * SIZE(NEW_Y, IS, SIZE) ALIGN_3.L990: cmpq $SIZE, INCY je .L999 movq M, %rax sarq $3, %rax jle .L997 ALIGN_3.L996: movss 0 * SIZE(NEW_Y), %xmm0 movss 1 * SIZE(NEW_Y), %xmm1 movss 2 * SIZE(NEW_Y), %xmm2 movss 3 * SIZE(NEW_Y), %xmm3 movss 4 * SIZE(NEW_Y), %xmm4 movss 5 * SIZE(NEW_Y), %xmm5 movss 6 * SIZE(NEW_Y), %xmm6 movss 7 * SIZE(NEW_Y), %xmm7 movss %xmm0, 0 * SIZE(Y) addq INCY, Y movss %xmm1, 0 * SIZE(Y) addq INCY, Y movss %xmm2, 0 * SIZE(Y) addq INCY, Y movss %xmm3, 0 * SIZE(Y) addq INCY, Y movss %xmm4, 0 * SIZE(Y) addq INCY, Y movss %xmm5, 0 * SIZE(Y) addq INCY, Y movss %xmm6, 0 * SIZE(Y) addq INCY, Y movss %xmm7, 0 * SIZE(Y) addq INCY, Y addq $8 * SIZE, NEW_Y decq %rax jg .L996 ALIGN_3.L997: movq M, %rax andq $7, %rax jle .L999 ALIGN_3.L998: movss 0 * SIZE(NEW_Y), %xmm0 movss %xmm0, 0 * SIZE(Y) addq INCY, Y addq $1 * SIZE, NEW_Y decq %rax jg .L998 ALIGN_3.L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15#ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15#endif addq $STACKSIZE, %rsp ret EPILOGUE
⌨️ 快捷键说明
复制代码:Ctrl + C
搜索代码:Ctrl + F
全屏模式:F11
增大字号:Ctrl + =
减小字号:Ctrl + -
显示快捷键:?