📄 symv_u_sse2.s
字号:
/*
 * Tail of the SSE2 SYMV (upper) kernel, x86-64, AT&T syntax, run through
 * the C preprocessor.
 *
 * NOTE(review): this span starts in the middle of the loop that branches
 * back to .L12.  The function entry, the .L11/.L12 labels and the macro
 * definitions (a1..a3, atemp1..atemp4, xsum1..xsum4, xtemp1/xtemp2, xt1,
 * yy1/yy2, A, A1, A2, LDA, XX, YY, NEW_X, NEW_Y, IS, I, M, Y, INCY, SIZE,
 * STACKSIZE, ALIGN_3, PREFETCH, PREFETCHSIZE, EPILOGUE) live outside it.
 * The comments below describe only what the visible instructions do.
 *
 * Recurring pattern in the loops:
 *	xt1   = xtempN * aK	-> accumulated into xsumJ
 *	aK    = atempJ * aK	-> accumulated into yy1 / yy2
 * followed by a reload of aK for a later step.
 */

	/* ---- remainder of the loop body that branches back to .L12 ---- */
	mulpd	atemp2, a3
	addpd	xt1, xsum2
	addpd	a3, yy2
	movsd	4 * SIZE(A2, LDA, 1), a3
	movhpd	5 * SIZE(A2, LDA, 1), a3

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp3, a1
	addpd	xt1, xsum3
	addpd	a1, yy1
	movsd	6 * SIZE(A2, LDA, 1), a1
	movhpd	7 * SIZE(A2, LDA, 1), a1

	PREFETCH	PREFETCHSIZE(A2, LDA, 1)

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum3
	addpd	a2, yy2
	movsd	10 * SIZE(A1), a2
	movhpd	11 * SIZE(A1), a2

	movapd	xtemp1, xt1
	movapd	8 * SIZE(XX), xtemp1		/* next pair from XX */
	mulpd	a3, xt1
	mulpd	atemp4, a3
	addpd	xt1, xsum4
	addpd	a3, yy1
	movsd	8 * SIZE(A1, LDA, 1), a3
	movhpd	9 * SIZE(A1, LDA, 1), a3

	movapd	xtemp2, xt1
	movapd	10 * SIZE(XX), xtemp2		/* next pair from XX */
	mulpd	a1, xt1
	mulpd	atemp4, a1
	addpd	xt1, xsum4
	addpd	a1, yy2
	movsd	8 * SIZE(A1), a1
	movhpd	9 * SIZE(A1), a1

	/* Store the updated YY[4..7] and preload YY[8..11]. */
	movsd	yy1, 4 * SIZE(YY)
	movhpd	yy1, 5 * SIZE(YY)
	movsd	8 * SIZE(YY), yy1
	movhpd	9 * SIZE(YY), yy1

	movsd	yy2, 6 * SIZE(YY)
	movhpd	yy2, 7 * SIZE(YY)
	movsd	10 * SIZE(YY), yy2
	movhpd	11 * SIZE(YY), yy2

	/* Advance all running pointers by 8 elements. */
	addq	$8 * SIZE, XX
	addq	$8 * SIZE, YY
	addq	$8 * SIZE, A1
	addq	$8 * SIZE, A2

	decq	I
	jg	.L12
	ALIGN_3

/*
 * .L15: taken only when bit 2 of IS is set.  Handles four more elements
 * of the four columns addressed as A1, A1+LDA, A2 and A2+LDA.
 */
.L15:
	testq	$4, IS
	jle	.L18

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	movsd	2 * SIZE(A1, LDA, 1), a1
	movhpd	3 * SIZE(A1, LDA, 1), a1

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp1, a2
	addpd	xt1, xsum1
	addpd	a2, yy2
	movsd	0 * SIZE(A2), a2
	movhpd	1 * SIZE(A2), a2

	movapd	xtemp1, xt1
	mulpd	a3, xt1
	mulpd	atemp2, a3
	addpd	xt1, xsum2
	addpd	a3, yy1
	movsd	2 * SIZE(A2), a3
	movhpd	3 * SIZE(A2), a3

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum2
	addpd	a1, yy2
	movsd	0 * SIZE(A2, LDA, 1), a1
	movhpd	1 * SIZE(A2, LDA, 1), a1

	movapd	xtemp1, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum3
	addpd	a2, yy1
	movsd	2 * SIZE(A2, LDA, 1), a2
	movhpd	3 * SIZE(A2, LDA, 1), a2

	movapd	xtemp2, xt1
	mulpd	a3, xt1
	mulpd	atemp3, a3
	addpd	xt1, xsum3
	addpd	a3, yy2

	movapd	xtemp1, xt1
	movapd	4 * SIZE(XX), xtemp1
	mulpd	a1, xt1
	mulpd	atemp4, a1
	addpd	xt1, xsum4
	addpd	a1, yy1

	movapd	xtemp2, xt1
	movapd	6 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum4
	addpd	a2, yy2

	/* Store YY[0..3], preload YY[4..7] for the block at .L18. */
	movsd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhpd	5 * SIZE(YY), yy1

	movsd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)
	movsd	6 * SIZE(YY), yy2
	movhpd	7 * SIZE(YY), yy2

	addq	$4 * SIZE, XX
	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3

/*
 * .L18: 4x4 block at the current A1/A2 position (presumably the block on
 * the diagonal -- confirm against the loop setup outside this span).
 * atemp1 becomes {hi(atemp1), hi(atemp2)} and atemp3 becomes
 * {hi(atemp3), hi(atemp4)}.  Each xsumN then receives two packed
 * products, which are summed horizontally further down.
 */
.L18:
	unpckhpd	atemp2, atemp1
	unpckhpd	atemp4, atemp3

	movsd	0 * SIZE(A1), a1
	movhpd	0 * SIZE(A1, LDA, 1), a1
	mulpd	atemp1, a1
	addpd	a1, xsum1

	movsd	0 * SIZE(A1, LDA, 1), a1
	movhpd	1 * SIZE(A1, LDA, 1), a1
	mulpd	atemp1, a1
	addpd	a1, xsum2

	movsd	0 * SIZE(A2), a1
	movhpd	1 * SIZE(A2), a1
	mulpd	atemp1, a1
	addpd	a1, xsum3

	movsd	0 * SIZE(A2, LDA, 1), a1
	movhpd	1 * SIZE(A2, LDA, 1), a1
	mulpd	atemp1, a1
	addpd	a1, xsum4

	movsd	0 * SIZE(A2), a1
	movhpd	0 * SIZE(A2, LDA, 1), a1
	mulpd	atemp3, a1
	addpd	a1, xsum1

	movsd	1 * SIZE(A2), a1
	movhpd	1 * SIZE(A2, LDA, 1), a1
	mulpd	atemp3, a1
	addpd	a1, xsum2

	movsd	2 * SIZE(A2), a1
	movhpd	2 * SIZE(A2, LDA, 1), a1
	mulpd	atemp3, a1
	addpd	a1, xsum3

	movsd	2 * SIZE(A2, LDA, 1), a1
	movhpd	3 * SIZE(A2, LDA, 1), a1
	mulpd	atemp3, a1
	addpd	a1, xsum4

	/*
	 * Horizontal reduction:
	 *   xsum1 = {lo+hi of xsum1, lo+hi of xsum2}
	 *   xsum3 = {lo+hi of xsum3, lo+hi of xsum4}
	 */
#ifndef HAVE_SSE3
	movapd	xsum1, atemp1
	movapd	xsum3, atemp3

	unpcklpd	xsum2, xsum1
	unpcklpd	xsum4, xsum3

	unpckhpd	xsum2, atemp1
	unpckhpd	xsum4, atemp3

	addpd	atemp1, xsum1
	addpd	atemp3, xsum3
#else
	haddpd	xsum2, xsum1
	haddpd	xsum4, xsum3
#endif

	addpd	xsum1, yy1
	addpd	xsum3, yy2

	movsd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)

	addq	$4, IS

	/* Another 4-column pass while IS + 4 <= M. */
	movq	IS, I
	addq	$4, I
	cmpq	M, I
	jle	.L11
	ALIGN_3

/* .L20: two-column remainder, taken when bit 1 of M is set. */
.L20:
	testq	$2, M
	je	.L30
	ALIGN_3

.L21:
	movq	A, A1
	leaq	(A, LDA, 2), A			/* A += 2 * LDA */

	/* Broadcast NEW_X[IS] and NEW_X[IS + 1] into both lanes. */
#ifdef HAVE_SSE3
	movddup	0 * SIZE(NEW_X, IS, SIZE), atemp1
	movddup	1 * SIZE(NEW_X, IS, SIZE), atemp2
#else
	movsd	0 * SIZE(NEW_X, IS, SIZE), atemp1
	movhpd	0 * SIZE(NEW_X, IS, SIZE), atemp1
	movsd	1 * SIZE(NEW_X, IS, SIZE), atemp2
	movhpd	1 * SIZE(NEW_X, IS, SIZE), atemp2
#endif

	pxor	xsum1, xsum1
	pxor	xsum2, xsum2

	movapd	0 * SIZE(NEW_X), xtemp1

	movsd	0 * SIZE(NEW_Y), yy1
	movhpd	1 * SIZE(NEW_Y), yy1

	movsd	0 * SIZE(A1), a1
	movhpd	1 * SIZE(A1), a1
	movsd	0 * SIZE(A1, LDA, 1), a2
	movhpd	1 * SIZE(A1, LDA, 1), a2

	movq	NEW_X, XX
	movq	NEW_Y, YY

	movq	IS, I
	sarq	$1, I				/* I = IS / 2 iterations */
	jle	.L28
	ALIGN_3

/* .L22: two elements of both columns per iteration. */
.L22:
	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	movsd	2 * SIZE(A1), a1
	movhpd	3 * SIZE(A1), a1

	movapd	xtemp1, xt1
	movapd	2 * SIZE(XX), xtemp1
	mulpd	a2, xt1
	mulpd	atemp2, a2
	addpd	xt1, xsum2
	addpd	a2, yy1
	movsd	2 * SIZE(A1, LDA, 1), a2
	movhpd	3 * SIZE(A1, LDA, 1), a2

	movsd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	2 * SIZE(YY), yy1
	movhpd	3 * SIZE(YY), yy1

	addq	$2 * SIZE, XX
	addq	$2 * SIZE, YY
	addq	$2 * SIZE, A1

	decq	I
	jg	.L22
	ALIGN_3

/* .L28: 2x2 block at the current A1 position, then horizontal sum. */
.L28:
	unpckhpd	atemp2, atemp1

	movsd	0 * SIZE(A1), a1
	movhpd	0 * SIZE(A1, LDA, 1), a1
	mulpd	atemp1, a1
	addpd	a1, xsum1

	movsd	0 * SIZE(A1, LDA, 1), a1
	movhpd	1 * SIZE(A1, LDA, 1), a1
	mulpd	atemp1, a1
	addpd	a1, xsum2

#ifndef HAVE_SSE3
	movapd	xsum1, atemp1
	unpcklpd	xsum2, xsum1
	unpckhpd	xsum2, atemp1
	addpd	atemp1, xsum1
#else
	haddpd	xsum2, xsum1
#endif

	addpd	xsum1, yy1

	movsd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)

	addq	$2, IS
	ALIGN_3

/* .L30: single-column remainder, taken when bit 0 of M is set. */
.L30:
	testq	$1, M
	je	.L990
	ALIGN_3

.L31:
	movq	A, A1

#ifdef HAVE_SSE3
	movddup	0 * SIZE(NEW_X, IS, SIZE), atemp1
#else
	movsd	0 * SIZE(NEW_X, IS, SIZE), atemp1
	movhpd	0 * SIZE(NEW_X, IS, SIZE), atemp1
#endif

	pxor	xsum1, xsum1

	movsd	0 * SIZE(NEW_X), xtemp1
	movsd	0 * SIZE(NEW_Y), yy1
	movsd	0 * SIZE(A1), a1

	movq	NEW_X, XX
	movq	NEW_Y, YY

	movq	IS, I
	testq	I, I
	jle	.L38
	ALIGN_3

/*
 * .L32: one element per iteration.  Operands are loaded with movsd, so
 * the upper lanes of a1, xtemp1 and yy1 are zero and the packed ops act
 * as scalar ones.
 */
.L32:
	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	movsd	1 * SIZE(A1), a1

	movsd	1 * SIZE(XX), xtemp1

	movsd	yy1, 0 * SIZE(YY)
	movsd	1 * SIZE(YY), yy1

	addq	$1 * SIZE, XX
	addq	$1 * SIZE, YY
	addq	$1 * SIZE, A1

	decq	I
	jg	.L32
	ALIGN_3

/*
 * .L38: final element of the single column.
 *
 * The previous version also loaded 0 * SIZE(A1, LDA, 1), multiplied it by
 * atemp1 and added it to xsum2.  xsum2 is not cleared in .L31 and is never
 * read after this point, so the result was discarded.  The load also
 * touched the column one LDA stride past the one being processed, which
 * need not exist for this remainder column.  Those three instructions
 * have been removed.
 */
.L38:
	movsd	0 * SIZE(A1), a1
	mulsd	atemp1, a1
	addsd	a1, xsum1

	addsd	xsum1, yy1

	movsd	yy1, 0 * SIZE(YY)
	ALIGN_3

/*
 * .L990: copy-back.  Skipped when INCY equals SIZE; otherwise M elements
 * are copied from NEW_Y to Y, stepping Y by INCY after every element
 * (INCY is presumably already scaled to bytes -- confirm in the prologue).
 */
.L990:
	cmpq	$SIZE, INCY
	je	.L999

	movq	M, %rax
	sarq	$3, %rax			/* groups of 8 elements */
	jle	.L997
	ALIGN_3

.L996:
	movapd	0 * SIZE(NEW_Y), %xmm0
	movapd	2 * SIZE(NEW_Y), %xmm1
	movapd	4 * SIZE(NEW_Y), %xmm2
	movapd	6 * SIZE(NEW_Y), %xmm3

	movsd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M, %rax
	andq	$7, %rax			/* leftover M % 8 elements */
	jle	.L999
	ALIGN_3

.L998:
	movsd	0 * SIZE(NEW_Y), %xmm0

	movsd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$1 * SIZE, NEW_Y

	decq	%rax
	jg	.L998
	ALIGN_3

/*
 * .L999: epilogue.  Reload the saved general-purpose registers from the
 * stack frame (plus rdi, rsi and xmm6-xmm15 under WINDOWS_ABI), release
 * STACKSIZE bytes and return.
 */
.L999:
	movq	0(%rsp), %rbx
	movq	8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	48(%rsp), %rdi
	movq	56(%rsp), %rsi
	movups	64(%rsp), %xmm6
	movups	80(%rsp), %xmm7
	movups	96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -