📄 zsymv_u_sse2.s
字号:
/*
 * Tail of the kernel, restored to one statement per line.
 *
 * NOTE(review): this span starts part-way through the loop whose head is
 * the label .L12; that label, .L11, the prologue and every alias used
 * below (a1..a3, atemp1..atemp4, xtemp1..xtemp4, xt1, xsum1, xsum2, yy1,
 * yy2, A, A1, A2, XX, YY, NEW_X, NEW_Y, Y, INCY, M, IS, I, SIZE,
 * STACKSIZE) and macro (MOVDDUP, ADD, PREFETCH, PREFETCHW, PREFETCHSIZE,
 * ALIGN_3, EPILOGUE) are defined outside this view.  Comments describe
 * only what the visible instructions do.
 *
 * Syntax: AT&T operand order (source, destination), run through the C
 * preprocessor, so every '#' directive must start its own line.
 * The instruction order is hand-scheduled; do not reorder.
 */

	/* ---- remainder of the .L12 loop body -------------------------- */
	mulpd	atemp3, a3
	addpd	xt1, xsum2
	addpd	a3, yy1
	MOVDDUP(4 * SIZE, A1, a3)

	movapd	xtemp4, xt1
	movapd	14 * SIZE(XX), xtemp4
	mulpd	a1, xt1
	mulpd	atemp2, a1
	ADD	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(6 * SIZE, A2, a1)

	/* Write yy2 back to YY[2..3] and pick up YY[6..7] for later use. */
	movlpd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)
	movsd	6 * SIZE(YY), yy2
	movhpd	7 * SIZE(YY), yy2

	movapd	xtemp2, xt1
	movapd	10 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp4, a2
	ADD	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(5 * SIZE, A1, a2)

	PREFETCH	PREFETCHSIZE(A2)

	/* Write yy1 back to YY[0..1] and pick up YY[4..5]. */
	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhpd	5 * SIZE(YY), yy1

	movapd	xtemp1, xt1
	mulpd	a3, xt1
	mulpd	atemp1, a3
	addpd	xt1, xsum1
	addpd	a3, yy1
	MOVDDUP(7 * SIZE, A2, a3)

	movapd	xtemp3, xt1
	mulpd	a1, xt1
	mulpd	atemp3, a1
	addpd	xt1, xsum2
	addpd	a1, yy2
	MOVDDUP(6 * SIZE, A1, a1)

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp2, a2
	ADD	xt1, xsum1
	addpd	a2, yy1
	MOVDDUP(4 * SIZE, A2, a2)

	PREFETCHW	PREFETCHSIZE(YY)

	movapd	xtemp4, xt1
	mulpd	a3, xt1
	mulpd	atemp4, a3
	ADD	xt1, xsum2
	addpd	a3, yy2
	MOVDDUP(7 * SIZE, A1, a3)

	/* From here on the xtemp registers are refilled from XX offsets
	   16..22, and a1..a3 from offsets 8..10 of A1/A2 (beyond the
	   8 * SIZE step applied below). */
	movapd	xtemp3, xt1
	movapd	20 * SIZE(XX), xtemp3
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(5 * SIZE, A2, a1)

	movapd	xtemp1, xt1
	movapd	16 * SIZE(XX), xtemp1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(10 * SIZE, A2, a2)

	movapd	xtemp4, xt1
	movapd	22 * SIZE(XX), xtemp4
	mulpd	a3, xt1
	mulpd	atemp2, a3
	ADD	xt1, xsum1
	addpd	a3, yy2
	MOVDDUP( 9 * SIZE, A1, a3)

	/* Write yy2 back to YY[6..7] and pick up YY[10..11]. */
	movlpd	yy2, 6 * SIZE(YY)
	movhpd	yy2, 7 * SIZE(YY)
	movsd	10 * SIZE(YY), yy2
	movhpd	11 * SIZE(YY), yy2

	movapd	xtemp2, xt1
	movapd	18 * SIZE(XX), xtemp2
	mulpd	a1, xt1
	mulpd	atemp4, a1
	ADD	xt1, xsum2
	addpd	a1, yy1
	MOVDDUP( 8 * SIZE, A1, a1)

	/* Write yy1 back to YY[4..5] and pick up YY[8..9]. */
	movlpd	yy1, 4 * SIZE(YY)
	movhpd	yy1, 5 * SIZE(YY)
	movsd	8 * SIZE(YY), yy1
	movhpd	9 * SIZE(YY), yy1

	/* Step the pointers: XX by 16 * SIZE (written as subtracting
	   -16 * SIZE), YY, A1 and A2 by 8 * SIZE each. */
	subq	$-16 * SIZE, XX
	addq	$ 8 * SIZE, YY
	addq	$ 8 * SIZE, A1
	addq	$ 8 * SIZE, A2

	/* Repeat while the decremented counter I is still > 0. */
	decq	I
	jg	.L12
	ALIGN_3

/*
 * .L15: block executed only when bit 1 of IS is set.  testq leaves ZF=1
 * when the bit is clear and jle is taken on ZF=1, so the block is then
 * skipped.  It uses the xtemp/a1/a2/yy values already held in registers
 * and steps YY, A1 and A2 by 4 * SIZE (XX is not advanced here).
 */
.L15:
	testq	$2, IS
	jle	.L18

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(1 * SIZE, A1, a1)

	movapd	xtemp3, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(3 * SIZE, A2, a2)

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	ADD	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(2 * SIZE, A1, a1)

	movapd	xtemp4, xt1
	mulpd	a2, xt1
	mulpd	atemp4, a2
	ADD	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(0 * SIZE, A2, a2)

	movapd	xtemp3, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(3 * SIZE, A1, a1)

	movapd	xtemp1, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(1 * SIZE, A2, a2)

	movapd	xtemp4, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	ADD	xt1, xsum1
	addpd	a1, yy2

	/* Write yy2 back to YY[2..3] and pick up YY[6..7]. */
	movlpd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)
	movsd	6 * SIZE(YY), yy2
	movhpd	7 * SIZE(YY), yy2

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp4, a2
	ADD	xt1, xsum2
	addpd	a2, yy1

	/* Write yy1 back to YY[0..1] and pick up YY[4..5]. */
	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhpd	5 * SIZE(YY), yy1

	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3

/*
 * .L18: adds products of atemp1..atemp4 with values fetched by MOVDDUP
 * from offsets 0..3 of A1/A2 into xsum1/xsum2, folds the two sums into
 * yy1/yy2 and stores them to YY[0..3].
 * NOTE(review): presumably this is the diagonal 2x2 block of the current
 * column pair - confirm against the part of the routine above this view.
 */
.L18:
	MOVDDUP(0 * SIZE, A1, a1)
	MOVDDUP(0 * SIZE, A2, a2)

	mulpd	atemp1, a1
	mulpd	atemp1, a2
	addpd	a1, xsum1
	addpd	a2, xsum2

	/* HEMV builds take only the A2[1] term here and subtract it. */
#ifndef HEMV
	MOVDDUP(1 * SIZE, A1, a1)
	MOVDDUP(1 * SIZE, A2, a2)

	mulpd	atemp2, a1
	mulpd	atemp2, a2
	addpd	a1, xsum1
	addpd	a2, xsum2
#else
	MOVDDUP(1 * SIZE, A2, a2)

	mulpd	atemp2, a2
	subpd	a2, xsum2
#endif

	MOVDDUP(0 * SIZE, A2, a1)
	MOVDDUP(2 * SIZE, A2, a2)

	mulpd	atemp3, a1
	mulpd	atemp3, a2
	addpd	a1, xsum1
	addpd	a2, xsum2

	/* HEMV builds take only the A2[1] term here and add it to xsum1. */
#ifndef HEMV
	MOVDDUP(1 * SIZE, A2, a1)
	MOVDDUP(3 * SIZE, A2, a2)

	mulpd	atemp4, a1
	mulpd	atemp4, a2
	addpd	a1, xsum1
	addpd	a2, xsum2
#else
	MOVDDUP(1 * SIZE, A2, a1)

	mulpd	atemp4, a1
	addpd	a1, xsum1
#endif

	addpd	xsum1, yy1
	addpd	xsum2, yy2

	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movlpd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)

	/* IS += 2; go round again via .L11 (outside this view) while
	   IS + 2 <= M. */
	addq	$2, IS

	movq	IS, I
	addq	$2, I
	cmpq	M, I
	jle	.L11
	ALIGN_3

/*
 * .L20: single-column pass, entered only when bit 0 of M is set (M odd);
 * otherwise control goes straight to .L990.
 */
.L20:
	testq	$1, M
	jle	.L990

	movq	A, A1
	/* I = IS * 4, used as an index (scaled by SIZE) into NEW_X. */
	leaq	(, IS, 4), I

	movapd	0 * SIZE(NEW_X, I, SIZE), atemp1
	movapd	2 * SIZE(NEW_X, I, SIZE), atemp2

	/* Clear both accumulators. */
	pxor	xsum1, xsum1
	pxor	xsum2, xsum2

	MOVDDUP(0 * SIZE, A1, a1)
	MOVDDUP(1 * SIZE, A1, a2)

	/* Preload the first values from NEW_X and NEW_Y. */
	movapd	0 * SIZE(NEW_X), xtemp1
	movapd	2 * SIZE(NEW_X), xtemp2
	movapd	4 * SIZE(NEW_X), xtemp3
	movapd	6 * SIZE(NEW_X), xtemp4

	movsd	0 * SIZE(NEW_Y), yy1
	movhpd	1 * SIZE(NEW_Y), yy1
	movsd	2 * SIZE(NEW_Y), yy2
	movhpd	3 * SIZE(NEW_Y), yy2

	movq	NEW_X, XX
	movq	NEW_Y, YY

	/* Trip count I = IS >> 1 (arithmetic); skip the loop when <= 0. */
	movq	IS, I
	sarq	$1, I
	jle	.L28
	ALIGN_3

/*
 * .L22: loop for the single-column pass.  Each trip steps XX by
 * 8 * SIZE and YY/A1 by 4 * SIZE.
 */
.L22:
	movapd	xtemp1, xt1
	movapd	8 * SIZE(XX), xtemp1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(2 * SIZE, A1, a1)

	movapd	xtemp2, xt1
	movapd	10 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp2, a2
	ADD	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(3 * SIZE, A1, a2)

	/* Write yy1 back to YY[0..1] and pick up YY[4..5]. */
	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhpd	5 * SIZE(YY), yy1

	movapd	xtemp3, xt1
	movapd	12 * SIZE(XX), xtemp3
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(4 * SIZE, A1, a1)

	movapd	xtemp4, xt1
	movapd	14 * SIZE(XX), xtemp4
	mulpd	a2, xt1
	mulpd	atemp2, a2
	ADD	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(5 * SIZE, A1, a2)

	/* Write yy2 back to YY[2..3] and pick up YY[6..7]. */
	movlpd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)
	movsd	6 * SIZE(YY), yy2
	movhpd	7 * SIZE(YY), yy2

	addq	$8 * SIZE, XX
	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1

	decq	I
	jg	.L22
	ALIGN_3

/*
 * .L28: last step of the single-column pass.  Adds the A1[0] (and, in
 * non-HEMV builds, A1[1]) products to the accumulators, combines
 * xsum2 into xsum1, adds that to yy1 and stores yy1 to YY[0..1].
 */
.L28:
	MOVDDUP(0 * SIZE, A1, a1)

#ifndef HEMV
	MOVDDUP(1 * SIZE, A1, a2)

	mulpd	atemp1, a1
	mulpd	atemp2, a2
	addpd	a1, xsum1
	addpd	a2, xsum2
#else
	mulpd	atemp1, a1
	addpd	a1, xsum1
#endif

	addpd	xsum2, xsum1
	addpd	xsum1, yy1

	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	ALIGN_3

/*
 * .L990: copy NEW_Y out to Y, advancing Y by INCY per element, unless
 * INCY equals 2 * SIZE, in which case the copy is skipped entirely.
 * NOTE(review): presumably INCY is already in bytes and NEW_Y aliases Y
 * in the skipped case - confirm in the prologue, which is outside this
 * view.
 */
.L990:
	cmpq	$2 * SIZE, INCY
	je	.L999

	/* Number of 4-element groups: M >> 2. */
	movq	M, %rax
	sarq	$2, %rax
	jle	.L997
	ALIGN_3

/* .L996: copy four 2 * SIZE elements per trip.  movapd requires NEW_Y
   to be 16-byte aligned; the stores to Y use movsd/movhpd, which have
   no alignment requirement. */
.L996:
	movapd	0 * SIZE(NEW_Y), %xmm0
	movapd	2 * SIZE(NEW_Y), %xmm1
	movapd	4 * SIZE(NEW_Y), %xmm2
	movapd	6 * SIZE(NEW_Y), %xmm3

	movsd	%xmm0, 0 * SIZE(Y)
	movhpd	%xmm0, 1 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm1, 0 * SIZE(Y)
	movhpd	%xmm1, 1 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm2, 0 * SIZE(Y)
	movhpd	%xmm2, 1 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm3, 0 * SIZE(Y)
	movhpd	%xmm3, 1 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

/* .L997: remaining elements, M & 3. */
.L997:
	movq	M, %rax
	andq	$3, %rax
	jle	.L999
	ALIGN_3

/* .L998: copy one 2 * SIZE element per trip. */
.L998:
	movapd	0 * SIZE(NEW_Y), %xmm0

	movsd	%xmm0, 0 * SIZE(Y)
	movhpd	%xmm0, 1 * SIZE(Y)
	addq	INCY, Y

	addq	$2 * SIZE, NEW_Y

	decq	%rax
	jg	.L998
	ALIGN_3

/*
 * .L999: common exit.  Reloads rbx, rbp and r12-r15 from the stack
 * frame; WINDOWS_ABI builds additionally reload rdi, rsi and
 * xmm6-xmm15.  Then releases STACKSIZE bytes and returns.  The matching
 * saves are in the prologue, outside this view.
 */
.L999:
	movq	0(%rsp), %rbx
	movq	8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	48(%rsp), %rdi
	movq	56(%rsp), %rsi
	movups	64(%rsp), %xmm6
	movups	80(%rsp), %xmm7
	movups	96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -