📄 zsymv_l_sse2.s
字号:
/*
 * Inner kernel tail, remainder handling and result copy-back.
 *
 * NOTE: this excerpt starts in the middle of a routine.  The function
 * entry, the register aliases (XX, YY, A1, A2, A, I, IS, M, INCY, Y,
 * NEW_X, NEW_Y, xtemp1-4, atemp1-4, a1, a2, xt1, xsum1, xsum2, yy1, yy2),
 * the macros (SIZE, MOVDDUP, ADD, ALIGN_3, PREFETCH, PREFETCHW,
 * PREFETCHSIZE) and the .L11 loop head are all defined outside it.
 *
 * Every arithmetic step below has the same shape:
 *     xt1   = xtempN * aK        (movapd + mulpd)
 *     aK    = aK * atempN        (mulpd)
 *     xsumJ = xsumJ (+) xt1      (addpd, or the ADD macro)
 *     yyJ   = yyJ + aK           (addpd)
 *     aK    = next matrix value  (MOVDDUP macro)
 * ADD and MOVDDUP are macros; presumably ADD selects addpd or subpd and
 * MOVDDUP loads one value into both halves of the register - TODO confirm
 * against their definitions.
 */

/* Finish priming the pipeline, then step past the part already consumed. */
	movapd 14 * SIZE(XX), xtemp4
	addq $8 * SIZE, XX
	addq $4 * SIZE, A1
	addq $4 * SIZE, A2

/* I = (M - IS - 2) >> 2 : trip count of the unrolled loop .L12. */
	movq M, I
	subq IS, I
	subq $2, I
	sarq $2, I
/*
 * NOTE(review): jle reads OF, which the Intel SDM leaves undefined after
 * a shift by more than one bit; this relies on hardware clearing it.
 * Verify if strict architectural behaviour matters.
 */
	jle .L15
	ALIGN_3

/*
 * Unrolled main loop.  One iteration stores YY[0..7], advances YY, A1 and
 * A2 by 8 * SIZE and XX by 16 * SIZE, and accumulates into xsum1 (from a1)
 * and xsum2 (from a2).
 */
.L12:
/* First half: steps that update yy1 from a1 and yy2 from a2. */
	movapd xtemp1, xt1
	mulpd a1, xt1
	mulpd atemp1, a1
	addpd xt1, xsum1
	addpd a1, yy1
	MOVDDUP(1 * SIZE, A1, a1)
	PREFETCH PREFETCHSIZE(A1)

	movapd xtemp3, xt1
	mulpd a2, xt1
	mulpd atemp3, a2
	addpd xt1, xsum2
	addpd a2, yy2
	MOVDDUP(3 * SIZE, A2, a2)

	movapd xtemp2, xt1
	mulpd a1, xt1
	mulpd atemp2, a1
	ADD xt1, xsum1
	addpd a1, yy1
	MOVDDUP(2 * SIZE, A1, a1)

	movapd xtemp4, xt1
	mulpd a2, xt1
	mulpd atemp4, a2
	ADD xt1, xsum2
	addpd a2, yy2
	MOVDDUP(0 * SIZE, A2, a2)
	PREFETCH PREFETCHSIZE(XX)

/*
 * Steps that update yy2 from a1 and yy1 from a2.  Each xtemp register is
 * copied to xt1 and then immediately reloaded from XX for a later step.
 */
	movapd xtemp3, xt1
	movapd 12 * SIZE(XX), xtemp3
	mulpd a1, xt1
	mulpd atemp1, a1
	addpd xt1, xsum1
	addpd a1, yy2
	MOVDDUP(3 * SIZE, A1, a1)

	movapd xtemp1, xt1
	movapd 8 * SIZE(XX), xtemp1
	mulpd a2, xt1
	mulpd atemp3, a2
	addpd xt1, xsum2
	addpd a2, yy1
	MOVDDUP(1 * SIZE, A2, a2)

	movapd xtemp4, xt1
	movapd 14 * SIZE(XX), xtemp4
	mulpd a1, xt1
	mulpd atemp2, a1
	ADD xt1, xsum1
	addpd a1, yy2
	MOVDDUP(4 * SIZE, A1, a1)

/* yy2 is complete: store YY[2..3], then load YY[6..7] into yy2. */
	movlpd yy2, 2 * SIZE(YY)
	movhpd yy2, 3 * SIZE(YY)
	movsd 6 * SIZE(YY), yy2
	movhpd 7 * SIZE(YY), yy2

	movapd xtemp2, xt1
	movapd 10 * SIZE(XX), xtemp2
	mulpd a2, xt1
	mulpd atemp4, a2
	ADD xt1, xsum2
	addpd a2, yy1
	MOVDDUP(6 * SIZE, A2, a2)
	PREFETCH PREFETCHSIZE(A2)

/* yy1 is complete: store YY[0..1], then load YY[4..5] into yy1. */
	movlpd yy1, 0 * SIZE(YY)
	movhpd yy1, 1 * SIZE(YY)
	movsd 4 * SIZE(YY), yy1
	movhpd 5 * SIZE(YY), yy1

/* Second half: same sequence with the next matrix and XX offsets. */
	movapd xtemp1, xt1
	mulpd a1, xt1
	mulpd atemp1, a1
	addpd xt1, xsum1
	addpd a1, yy1
	MOVDDUP(5 * SIZE, A1, a1)

	movapd xtemp3, xt1
	mulpd a2, xt1
	mulpd atemp3, a2
	addpd xt1, xsum2
	addpd a2, yy2
	MOVDDUP(7 * SIZE, A2, a2)

	movapd xtemp2, xt1
	mulpd a1, xt1
	mulpd atemp2, a1
	ADD xt1, xsum1
	addpd a1, yy1
	MOVDDUP(6 * SIZE, A1, a1)
	PREFETCHW PREFETCHSIZE(YY)

	movapd xtemp4, xt1
	mulpd a2, xt1
	mulpd atemp4, a2
	ADD xt1, xsum2
	addpd a2, yy2
	MOVDDUP(4 * SIZE, A2, a2)

	movapd xtemp3, xt1
	movapd 20 * SIZE(XX), xtemp3
	mulpd a1, xt1
	mulpd atemp1, a1
	addpd xt1, xsum1
	addpd a1, yy2
	MOVDDUP(7 * SIZE, A1, a1)

	movapd xtemp1, xt1
	movapd 16 * SIZE(XX), xtemp1
	mulpd a2, xt1
	mulpd atemp3, a2
	addpd xt1, xsum2
	addpd a2, yy1
	MOVDDUP(5 * SIZE, A2, a2)

	movapd xtemp4, xt1
	movapd 22 * SIZE(XX), xtemp4
	mulpd a1, xt1
	mulpd atemp2, a1
	ADD xt1, xsum1
	addpd a1, yy2
	MOVDDUP( 8 * SIZE, A1, a1)

/* Store YY[6..7]; load YY[10..11] for the next iteration. */
	movlpd yy2, 6 * SIZE(YY)
	movhpd yy2, 7 * SIZE(YY)
	movsd 10 * SIZE(YY), yy2
	movhpd 11 * SIZE(YY), yy2

	movapd xtemp2, xt1
	movapd 18 * SIZE(XX), xtemp2
	mulpd a2, xt1
	mulpd atemp4, a2
	ADD xt1, xsum2
	addpd a2, yy1
	MOVDDUP(10 * SIZE, A2, a2)

/* Store YY[4..5]; load YY[8..9] for the next iteration. */
	movlpd yy1, 4 * SIZE(YY)
	movhpd yy1, 5 * SIZE(YY)
	movsd 8 * SIZE(YY), yy1
	movhpd 9 * SIZE(YY), yy1

/*
 * Advance all pointers.  XX moves by 16 * SIZE, written as a subtraction
 * of a negative immediate (presumably so that it fits a signed 8-bit
 * immediate when SIZE is 8 - TODO confirm).
 */
	subq $-16 * SIZE, XX
	addq $ 8 * SIZE, YY
	addq $ 8 * SIZE, A1
	addq $ 8 * SIZE, A2
	decq I
	jg .L12
	ALIGN_3

/*
 * Remainder: runs one half-iteration when bit 1 of (M - IS - 2) is set.
 * testq clears OF and a mask of 2 cannot set SF, so jle here is taken
 * exactly when the tested bit is zero.
 */
.L15:
	movq M, I
	subq IS, I
	subq $2, I
	testq $2, I
	jle .L16

	movapd xtemp1, xt1
	mulpd a1, xt1
	mulpd atemp1, a1
	addpd xt1, xsum1
	addpd a1, yy1
	MOVDDUP(1 * SIZE, A1, a1)

	movapd xtemp3, xt1
	mulpd a2, xt1
	mulpd atemp3, a2
	addpd xt1, xsum2
	addpd a2, yy2
	MOVDDUP(3 * SIZE, A2, a2)

	movapd xtemp2, xt1
	mulpd a1, xt1
	mulpd atemp2, a1
	ADD xt1, xsum1
	addpd a1, yy1
	MOVDDUP(2 * SIZE, A1, a1)

	movapd xtemp4, xt1
	mulpd a2, xt1
	mulpd atemp4, a2
	ADD xt1, xsum2
	addpd a2, yy2
	MOVDDUP(0 * SIZE, A2, a2)

	movapd xtemp3, xt1
	movapd 12 * SIZE(XX), xtemp3
	mulpd a1, xt1
	mulpd atemp1, a1
	addpd xt1, xsum1
	addpd a1, yy2
	MOVDDUP(3 * SIZE, A1, a1)

	movapd xtemp1, xt1
	movapd 8 * SIZE(XX), xtemp1
	mulpd a2, xt1
	mulpd atemp3, a2
	addpd xt1, xsum2
	addpd a2, yy1
	MOVDDUP(1 * SIZE, A2, a2)

	movapd xtemp4, xt1
	movapd 14 * SIZE(XX), xtemp4
	mulpd a1, xt1
	mulpd atemp2, a1
	ADD xt1, xsum1
	addpd a1, yy2
	MOVDDUP(4 * SIZE, A1, a1)

/* Store YY[2..3]; load YY[6..7] into yy2. */
	movlpd yy2, 2 * SIZE(YY)
	movhpd yy2, 3 * SIZE(YY)
	movsd 6 * SIZE(YY), yy2
	movhpd 7 * SIZE(YY), yy2

/* Last step of the half-iteration; unlike .L12, a2 is not reloaded here. */
	movapd xtemp2, xt1
	movapd 10 * SIZE(XX), xtemp2
	mulpd a2, xt1
	mulpd atemp4, a2
	ADD xt1, xsum2
	addpd a2, yy1

/* Store YY[0..1]; load YY[4..5] into yy1. */
	movlpd yy1, 0 * SIZE(YY)
	movhpd yy1, 1 * SIZE(YY)
	movsd 4 * SIZE(YY), yy1
	movhpd 5 * SIZE(YY), yy1

/* XX is not advanced on this path; the code below no longer reads it. */
	addq $4 * SIZE, YY
	addq $4 * SIZE, A1
	addq $4 * SIZE, A2
	ALIGN_3

/*
 * Remainder: runs when bit 0 of M is set.  All four steps accumulate into
 * yy1, which is then stored to YY[0..1].  a1 still holds the value loaded
 * by an earlier MOVDDUP.
 */
.L16:
	testq $1, M
	jle .L18

	MOVDDUP(1 * SIZE, A1, a2)

	movapd xtemp1, xt1
	mulpd a1, xt1
	mulpd atemp1, a1
	addpd xt1, xsum1
	addpd a1, yy1
	MOVDDUP(0 * SIZE, A2, a1)

	movapd xtemp2, xt1
	mulpd a2, xt1
	mulpd atemp2, a2
	ADD xt1, xsum1
	addpd a2, yy1
	MOVDDUP(1 * SIZE, A2, a2)

	movapd xtemp1, xt1
	mulpd a1, xt1
	mulpd atemp3, a1
	addpd xt1, xsum2
	addpd a1, yy1

	movapd xtemp2, xt1
	mulpd a2, xt1
	mulpd atemp4, a2
	ADD xt1, xsum2
	addpd a2, yy1

	movlpd yy1, 0 * SIZE(YY)
	movhpd yy1, 1 * SIZE(YY)
	ALIGN_3

/*
 * Fold the accumulators into NEW_Y: with I = IS * SIZE, add xsum1 to the
 * pair at offsets 0..1 and xsum2 to the pair at offsets 2..3 of
 * NEW_Y + 2 * I.
 */
.L18:
	leaq (, IS, SIZE), I

	movsd 0 * SIZE(NEW_Y, I, 2), yy1
	movhpd 1 * SIZE(NEW_Y, I, 2), yy1
	movsd 2 * SIZE(NEW_Y, I, 2), yy2
	movhpd 3 * SIZE(NEW_Y, I, 2), yy2

	addpd xsum1, yy1
	addpd xsum2, yy2

	movlpd yy1, 0 * SIZE(NEW_Y, I, 2)
	movhpd yy1, 1 * SIZE(NEW_Y, I, 2)
	movlpd yy2, 2 * SIZE(NEW_Y, I, 2)
	movhpd yy2, 3 * SIZE(NEW_Y, I, 2)

/* IS += 2; repeat from .L11 (outside this excerpt) while IS + 2 <= M. */
	addq $2, IS

	movq IS, I
	addq $2, I
	cmpq M, I
	jle .L11
	ALIGN_3

/*
 * Final single entry, handled only when M is odd.  Reads two vectors from
 * NEW_X + 4 * I and one value pair from NEW_Y + 2 * I, with I = IS * SIZE.
 */
.L20:
	testq $1, M
	jle .L990

	leaq (, IS, SIZE), I

	movapd 0 * SIZE(NEW_X, I, 4), atemp1
	movapd 2 * SIZE(NEW_X, I, 4), atemp2

	movsd 0 * SIZE(NEW_Y, I, 2), yy1
	movhpd 1 * SIZE(NEW_Y, I, 2), yy1

#ifndef HEMV
/* Without HEMV both values at A[0] and A[1] contribute. */
	MOVDDUP(0 * SIZE, A, a1)
	MOVDDUP(1 * SIZE, A, a2)

	mulpd atemp1, a1
	mulpd atemp2, a2
	addpd a1, yy1
	addpd a2, yy1
#else
/* With HEMV only the value at A[0] is used. */
	MOVDDUP(0 * SIZE, A, a1)

	mulpd atemp1, a1
	addpd a1, yy1
#endif

	movlpd yy1, 0 * SIZE(NEW_Y, I, 2)
	movhpd yy1, 1 * SIZE(NEW_Y, I, 2)
	ALIGN_3

/*
 * Copy-back.  Skipped entirely when INCY equals 2 * SIZE.  Otherwise M
 * entries of two values each are copied from NEW_Y to Y, advancing Y by
 * INCY per entry.  NEW_Y is read with movapd, so it must be 16-byte
 * aligned.
 */
.L990:
	cmpq $2 * SIZE, INCY
	je .L999

/* rax = M >> 2 : number of four-entry groups. */
	movq M, %rax
	sarq $2, %rax
/* NOTE(review): same sarq-then-jle flag dependency as noted above. */
	jle .L997
	ALIGN_3

/* Four entries per iteration. */
.L996:
	movapd 0 * SIZE(NEW_Y), %xmm0
	movapd 2 * SIZE(NEW_Y), %xmm1
	movapd 4 * SIZE(NEW_Y), %xmm2
	movapd 6 * SIZE(NEW_Y), %xmm3

	movsd %xmm0, 0 * SIZE(Y)
	movhpd %xmm0, 1 * SIZE(Y)
	addq INCY, Y

	movsd %xmm1, 0 * SIZE(Y)
	movhpd %xmm1, 1 * SIZE(Y)
	addq INCY, Y

	movsd %xmm2, 0 * SIZE(Y)
	movhpd %xmm2, 1 * SIZE(Y)
	addq INCY, Y

	movsd %xmm3, 0 * SIZE(Y)
	movhpd %xmm3, 1 * SIZE(Y)
	addq INCY, Y

	addq $8 * SIZE, NEW_Y
	decq %rax
	jg .L996
	ALIGN_3

/* rax = M & 3 : leftover entries; andq clears OF, so jle means "zero". */
.L997:
	movq M, %rax
	andq $3, %rax
	jle .L999
	ALIGN_3

/* One entry per iteration. */
.L998:
	movapd 0 * SIZE(NEW_Y), %xmm0

	movsd %xmm0, 0 * SIZE(Y)
	movhpd %xmm0, 1 * SIZE(Y)
	addq INCY, Y

	addq $2 * SIZE, NEW_Y
	decq %rax
	jg .L998
	ALIGN_3

.L999:
/*
 * Function exit (reached via .L999): reload the saved registers from the
 * stack frame, release the frame and return.  The matching stores and the
 * definition of STACKSIZE are outside this excerpt (presumably in the
 * prologue - verify that the offsets below agree with it).
 */

/* Registers that are callee-saved under both System V and Microsoft x64. */
	movq 0(%rsp), %rbx
	movq 8(%rsp), %rbp
	movq 16(%rsp), %r12
	movq 24(%rsp), %r13
	movq 32(%rsp), %r14
	movq 40(%rsp), %r15

#ifdef WINDOWS_ABI
/*
 * The Microsoft x64 convention additionally treats rdi, rsi and
 * xmm6-xmm15 as callee-saved.  movups is used, so the 16-byte save slots
 * need no particular alignment.
 */
	movq 48(%rsp), %rdi
	movq 56(%rsp), %rsi
	movups 64(%rsp), %xmm6
	movups 80(%rsp), %xmm7
	movups 96(%rsp), %xmm8
	movups 112(%rsp), %xmm9
	movups 128(%rsp), %xmm10
	movups 144(%rsp), %xmm11
	movups 160(%rsp), %xmm12
	movups 176(%rsp), %xmm13
	movups 192(%rsp), %xmm14
	movups 208(%rsp), %xmm15
#endif

/* Drop the frame and return to the caller. */
	addq $STACKSIZE, %rsp
	ret

/* EPILOGUE is a macro defined outside this excerpt. */
	EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -