📄 zaxpy_sse2.s
字号:
/*
 * Fragment: tail of the unrolled loop .L32, its remainder handlers
 * (.L35/.L36/.L37), the .L40 path, the strided .L50 path and the common
 * exit .L99.  The beginning of the routine (prologue, argument loads, the
 * setup of %xmm6/%xmm7, the dispatch to .L40/.L50 and the head of .L32)
 * lies before this chunk.
 *
 * X, Y, YY, M, INCX, INCY, SIZE, PREFETCHSIZE, ALIGN_3 and EPILOGUE are
 * macros defined outside this chunk.
 * NOTE(review): ALIGN_3 is presumably an alignment directive and SIZE the
 * size of one double -- confirm against the project headers.
 *
 * Every "element" below is a pair of consecutive doubles and is updated
 * the same way:
 *
 *     xmm0 = [ x[0], x[1] ]
 *     xmm1 = [ x[1], x[0] ]      (pshufd $0x4e swaps the 64-bit halves)
 *     y[0..1] = (y[0..1] + xmm6 * xmm0) + xmm7 * xmm1
 *
 * NOTE(review): the file name suggests a double-complex AXPY, with %xmm6
 * and %xmm7 holding factors derived from alpha; their setup is not visible
 * here -- confirm.
 *
 * The three paths differ only in how memory is addressed:
 *   .L32-.L37  X is read with movapd (requires 16-byte aligned X),
 *   .L40-.L47  X is read with movsd/movhpd (no alignment requirement),
 *   .L50-.L57  X and Y are advanced by INCX/INCY after every element.
 * Y is always read and written with movsd/movhpd.
 *
 * Remainder tests use "testl $k, M ; jle": test clears OF and the masked
 * result can never be negative, so jle is taken exactly when bit k of M
 * is clear.
 */

	/* .L32 (continued): finish elements at Y[4..7]; the matching loads
	   and multiplies are before this chunk. */
	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm3, %xmm5

	movsd	%xmm4, 4 * SIZE(Y)
	movhpd	%xmm4, 5 * SIZE(Y)
	movsd	%xmm5, 6 * SIZE(Y)
	movhpd	%xmm5, 7 * SIZE(Y)

#ifdef OPTERON
	prefetcht0 (PREFETCHSIZE + 8) * SIZE(X)
	prefetchw (PREFETCHSIZE + 8) * SIZE(Y)
#endif

	/* Elements at offsets 8..11. */
	movapd	8 * SIZE(X), %xmm0
	movapd	10 * SIZE(X), %xmm2
	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	movsd	8 * SIZE(Y), %xmm4
	movhpd	9 * SIZE(Y), %xmm4
	movsd	10 * SIZE(Y), %xmm5
	movhpd	11 * SIZE(Y), %xmm5

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1
	mulpd	%xmm6, %xmm2
	mulpd	%xmm7, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm3, %xmm5

	movsd	%xmm4, 8 * SIZE(Y)
	movhpd	%xmm4, 9 * SIZE(Y)
	movsd	%xmm5, 10 * SIZE(Y)
	movhpd	%xmm5, 11 * SIZE(Y)

	/* Elements at offsets 12..15. */
	movapd	12 * SIZE(X), %xmm0
	movapd	14 * SIZE(X), %xmm2
	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	movsd	12 * SIZE(Y), %xmm4
	movhpd	13 * SIZE(Y), %xmm4
	movsd	14 * SIZE(Y), %xmm5
	movhpd	15 * SIZE(Y), %xmm5

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1
	mulpd	%xmm6, %xmm2
	mulpd	%xmm7, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm3, %xmm5

	movsd	%xmm4, 12 * SIZE(Y)
	movhpd	%xmm4, 13 * SIZE(Y)
	movsd	%xmm5, 14 * SIZE(Y)
	movhpd	%xmm5, 15 * SIZE(Y)

#ifdef PENTIUM4
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(Y)
#endif

	/* One iteration consumed 16 * SIZE bytes of X and of Y (8 elements);
	   %eax is the remaining iteration count. */
	addl	$16 * SIZE, X
	addl	$16 * SIZE, Y
	decl	%eax
	jg	.L32
	ALIGN_3

.L35:
	/* movapd path, remainder: 4 elements if bit 2 of M is set. */
	testl	$4, M
	jle	.L36

	movapd	0 * SIZE(X), %xmm0
	movapd	2 * SIZE(X), %xmm2
	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4
	movsd	2 * SIZE(Y), %xmm5
	movhpd	3 * SIZE(Y), %xmm5

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1
	mulpd	%xmm6, %xmm2
	mulpd	%xmm7, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm3, %xmm5

	movsd	%xmm4, 0 * SIZE(Y)
	movhpd	%xmm4, 1 * SIZE(Y)
	movsd	%xmm5, 2 * SIZE(Y)
	movhpd	%xmm5, 3 * SIZE(Y)

	movapd	4 * SIZE(X), %xmm0
	movapd	6 * SIZE(X), %xmm2
	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	movsd	4 * SIZE(Y), %xmm4
	movhpd	5 * SIZE(Y), %xmm4
	movsd	6 * SIZE(Y), %xmm5
	movhpd	7 * SIZE(Y), %xmm5

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1
	mulpd	%xmm6, %xmm2
	mulpd	%xmm7, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm3, %xmm5

	movsd	%xmm4, 4 * SIZE(Y)
	movhpd	%xmm4, 5 * SIZE(Y)
	movsd	%xmm5, 6 * SIZE(Y)
	movhpd	%xmm5, 7 * SIZE(Y)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L36:
	/* movapd path, remainder: 2 elements if bit 1 of M is set. */
	testl	$2, M
	jle	.L37

	movapd	0 * SIZE(X), %xmm0
	movapd	2 * SIZE(X), %xmm2
	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4
	movsd	2 * SIZE(Y), %xmm5
	movhpd	3 * SIZE(Y), %xmm5

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1
	mulpd	%xmm6, %xmm2
	mulpd	%xmm7, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm3, %xmm5

	movsd	%xmm4, 0 * SIZE(Y)
	movhpd	%xmm4, 1 * SIZE(Y)
	movsd	%xmm5, 2 * SIZE(Y)
	movhpd	%xmm5, 3 * SIZE(Y)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L37:
	/* movapd path, remainder: last single element if bit 0 of M is set. */
	testl	$1, M
	jle	.L99

	movapd	0 * SIZE(X), %xmm0
	pshufd	$0x4e, %xmm0, %xmm1

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4

	movsd	%xmm4, 0 * SIZE(Y)
	movhpd	%xmm4, 1 * SIZE(Y)
	jmp	.L99
	ALIGN_3

.L40:
	/* Same computation as the movapd path, but X is loaded with
	   movsd/movhpd, which do not require 16-byte alignment.
	   %eax = M >> 3 = number of 8-element iterations; skip the loop
	   when that is <= 0. */
	movl	M, %eax
	sarl	$3, %eax
	jle	.L45
	ALIGN_3

.L42:
#ifdef OPTERON
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(X)
	prefetchw (PREFETCHSIZE + 0) * SIZE(Y)
#endif

	/* Elements at offsets 0..3. */
	movsd	0 * SIZE(X), %xmm0
	movhpd	1 * SIZE(X), %xmm0
	movsd	2 * SIZE(X), %xmm2
	movhpd	3 * SIZE(X), %xmm2
	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4
	movsd	2 * SIZE(Y), %xmm5
	movhpd	3 * SIZE(Y), %xmm5

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1
	mulpd	%xmm6, %xmm2
	mulpd	%xmm7, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm3, %xmm5

	movsd	%xmm4, 0 * SIZE(Y)
	movhpd	%xmm4, 1 * SIZE(Y)
	movsd	%xmm5, 2 * SIZE(Y)
	movhpd	%xmm5, 3 * SIZE(Y)

	/* Elements at offsets 4..7. */
	movsd	4 * SIZE(X), %xmm0
	movhpd	5 * SIZE(X), %xmm0
	movsd	6 * SIZE(X), %xmm2
	movhpd	7 * SIZE(X), %xmm2
	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	movsd	4 * SIZE(Y), %xmm4
	movhpd	5 * SIZE(Y), %xmm4
	movsd	6 * SIZE(Y), %xmm5
	movhpd	7 * SIZE(Y), %xmm5

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1
	mulpd	%xmm6, %xmm2
	mulpd	%xmm7, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm3, %xmm5

	movsd	%xmm4, 4 * SIZE(Y)
	movhpd	%xmm4, 5 * SIZE(Y)
	movsd	%xmm5, 6 * SIZE(Y)
	movhpd	%xmm5, 7 * SIZE(Y)

#ifdef OPTERON
	prefetcht0 (PREFETCHSIZE + 8) * SIZE(X)
	prefetchw (PREFETCHSIZE + 8) * SIZE(Y)
#endif

	/* Elements at offsets 8..11. */
	movsd	8 * SIZE(X), %xmm0
	movhpd	9 * SIZE(X), %xmm0
	movsd	10 * SIZE(X), %xmm2
	movhpd	11 * SIZE(X), %xmm2
	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	movsd	8 * SIZE(Y), %xmm4
	movhpd	9 * SIZE(Y), %xmm4
	movsd	10 * SIZE(Y), %xmm5
	movhpd	11 * SIZE(Y), %xmm5

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1
	mulpd	%xmm6, %xmm2
	mulpd	%xmm7, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm3, %xmm5

	movsd	%xmm4, 8 * SIZE(Y)
	movhpd	%xmm4, 9 * SIZE(Y)
	movsd	%xmm5, 10 * SIZE(Y)
	movhpd	%xmm5, 11 * SIZE(Y)

	/* Elements at offsets 12..15. */
	movsd	12 * SIZE(X), %xmm0
	movhpd	13 * SIZE(X), %xmm0
	movsd	14 * SIZE(X), %xmm2
	movhpd	15 * SIZE(X), %xmm2
	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	movsd	12 * SIZE(Y), %xmm4
	movhpd	13 * SIZE(Y), %xmm4
	movsd	14 * SIZE(Y), %xmm5
	movhpd	15 * SIZE(Y), %xmm5

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1
	mulpd	%xmm6, %xmm2
	mulpd	%xmm7, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm3, %xmm5

	movsd	%xmm4, 12 * SIZE(Y)
	movhpd	%xmm4, 13 * SIZE(Y)
	movsd	%xmm5, 14 * SIZE(Y)
	movhpd	%xmm5, 15 * SIZE(Y)

#ifdef PENTIUM4
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(Y)
#endif

	addl	$16 * SIZE, X
	addl	$16 * SIZE, Y
	decl	%eax
	jg	.L42
	ALIGN_3

.L45:
	/* movsd/movhpd path, remainder: 4 elements if bit 2 of M is set. */
	testl	$4, M
	jle	.L46

	movsd	0 * SIZE(X), %xmm0
	movhpd	1 * SIZE(X), %xmm0
	movsd	2 * SIZE(X), %xmm2
	movhpd	3 * SIZE(X), %xmm2
	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4
	movsd	2 * SIZE(Y), %xmm5
	movhpd	3 * SIZE(Y), %xmm5

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1
	mulpd	%xmm6, %xmm2
	mulpd	%xmm7, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm3, %xmm5

	movsd	%xmm4, 0 * SIZE(Y)
	movhpd	%xmm4, 1 * SIZE(Y)
	movsd	%xmm5, 2 * SIZE(Y)
	movhpd	%xmm5, 3 * SIZE(Y)

	movsd	4 * SIZE(X), %xmm0
	movhpd	5 * SIZE(X), %xmm0
	movsd	6 * SIZE(X), %xmm2
	movhpd	7 * SIZE(X), %xmm2
	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	movsd	4 * SIZE(Y), %xmm4
	movhpd	5 * SIZE(Y), %xmm4
	movsd	6 * SIZE(Y), %xmm5
	movhpd	7 * SIZE(Y), %xmm5

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1
	mulpd	%xmm6, %xmm2
	mulpd	%xmm7, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm3, %xmm5

	movsd	%xmm4, 4 * SIZE(Y)
	movhpd	%xmm4, 5 * SIZE(Y)
	movsd	%xmm5, 6 * SIZE(Y)
	movhpd	%xmm5, 7 * SIZE(Y)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L46:
	/* movsd/movhpd path, remainder: 2 elements if bit 1 of M is set. */
	testl	$2, M
	jle	.L47

	movsd	0 * SIZE(X), %xmm0
	movhpd	1 * SIZE(X), %xmm0
	movsd	2 * SIZE(X), %xmm2
	movhpd	3 * SIZE(X), %xmm2
	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4
	movsd	2 * SIZE(Y), %xmm5
	movhpd	3 * SIZE(Y), %xmm5

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1
	mulpd	%xmm6, %xmm2
	mulpd	%xmm7, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm3, %xmm5

	movsd	%xmm4, 0 * SIZE(Y)
	movhpd	%xmm4, 1 * SIZE(Y)
	movsd	%xmm5, 2 * SIZE(Y)
	movhpd	%xmm5, 3 * SIZE(Y)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L47:
	/* movsd/movhpd path, remainder: last single element if bit 0 of M
	   is set. */
	testl	$1, M
	jle	.L99

	movsd	0 * SIZE(X), %xmm0
	movhpd	1 * SIZE(X), %xmm0
	pshufd	$0x4e, %xmm0, %xmm1

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4

	movsd	%xmm4, 0 * SIZE(Y)
	movhpd	%xmm4, 1 * SIZE(Y)
	jmp	.L99
	ALIGN_3

.L50:
	/* Strided path.  Y is the read cursor and YY (initialised from Y) the
	   write cursor; both advance by INCY per element, X by INCX.  INCX and
	   INCY are added directly to the pointers, so they are used here as
	   byte strides.
	   %eax = M >> 2 = number of 4-element iterations. */
	movl	Y, YY

	movl	M, %eax
	sarl	$2, %eax
	jle	.L55
	ALIGN_3

.L52:
	/* First pair of elements of this iteration. */
	movsd	0 * SIZE(X), %xmm0
	movhpd	1 * SIZE(X), %xmm0
	addl	INCX, X
	movsd	0 * SIZE(X), %xmm2
	movhpd	1 * SIZE(X), %xmm2
	addl	INCX, X

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4
	addl	INCY, Y
	movsd	0 * SIZE(Y), %xmm5
	movhpd	1 * SIZE(Y), %xmm5
	addl	INCY, Y

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1
	mulpd	%xmm6, %xmm2
	mulpd	%xmm7, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm3, %xmm5

	movsd	%xmm4, 0 * SIZE(YY)
	movhpd	%xmm4, 1 * SIZE(YY)
	addl	INCY, YY
	movsd	%xmm5, 0 * SIZE(YY)
	movhpd	%xmm5, 1 * SIZE(YY)
	addl	INCY, YY

	/* Second pair of elements of this iteration. */
	movsd	0 * SIZE(X), %xmm0
	movhpd	1 * SIZE(X), %xmm0
	addl	INCX, X
	movsd	0 * SIZE(X), %xmm2
	movhpd	1 * SIZE(X), %xmm2
	addl	INCX, X

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4
	addl	INCY, Y
	movsd	0 * SIZE(Y), %xmm5
	movhpd	1 * SIZE(Y), %xmm5
	addl	INCY, Y

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1
	mulpd	%xmm6, %xmm2
	mulpd	%xmm7, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm3, %xmm5

	movsd	%xmm4, 0 * SIZE(YY)
	movhpd	%xmm4, 1 * SIZE(YY)
	addl	INCY, YY
	movsd	%xmm5, 0 * SIZE(YY)
	movhpd	%xmm5, 1 * SIZE(YY)
	addl	INCY, YY

	decl	%eax
	jg	.L52
	ALIGN_3

.L55:
	/* Strided remainder: 2 elements if bit 1 of M is set. */
	testl	$2, M
	jle	.L57

	movsd	0 * SIZE(X), %xmm0
	movhpd	1 * SIZE(X), %xmm0
	addl	INCX, X
	movsd	0 * SIZE(X), %xmm2
	movhpd	1 * SIZE(X), %xmm2
	addl	INCX, X

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4
	addl	INCY, Y
	movsd	0 * SIZE(Y), %xmm5
	movhpd	1 * SIZE(Y), %xmm5
	addl	INCY, Y

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1
	mulpd	%xmm6, %xmm2
	mulpd	%xmm7, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm3, %xmm5

	movsd	%xmm4, 0 * SIZE(YY)
	movhpd	%xmm4, 1 * SIZE(YY)
	addl	INCY, YY
	movsd	%xmm5, 0 * SIZE(YY)
	movhpd	%xmm5, 1 * SIZE(YY)
	addl	INCY, YY
	ALIGN_3

.L57:
	/* Strided remainder: last single element if bit 0 of M is set.
	   Y and YY have been advanced by the same amount on every path that
	   reaches this point, so the load through Y and the store through YY
	   address the same element. */
	testl	$1, M
	jle	.L99

	movsd	0 * SIZE(X), %xmm0
	movhpd	1 * SIZE(X), %xmm0
	pshufd	$0x4e, %xmm0, %xmm1

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4

	mulpd	%xmm6, %xmm0
	mulpd	%xmm7, %xmm1

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4

	movsd	%xmm4, 0 * SIZE(YY)
	movhpd	%xmm4, 1 * SIZE(YY)
	jmp	.L99
	ALIGN_3

.L99:
	/* Common exit: return 0 in %eax and restore the four saved registers
	   (the matching pushes are before this chunk). */
	xorl	%eax,%eax
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -