📄 zaxpy_sse2_core2.s
字号:
/*
 * Interior fragment of a larger routine. The code that precedes the first
 * instruction and the code that follows the last one are outside this view,
 * so the first group and the .L41 loop below are incomplete here.
 *
 * Layout restored to one statement per line. In the collapsed text ALIGN_3
 * was fused with the following label ("ALIGN_3.L25:") and the
 * #ifndef/#else/#endif directives sat in the middle of a line, where the
 * preprocessor does not treat them as directives.
 *
 * What every group below computes for one 16-byte pair of Y:
 *   pshufd paths : y += xmm14 * x + xmm15 * swap(x)
 *                  (pshufd $0x4e exchanges the two 64-bit halves of x)
 *   MOVDDUP path : y += xmm14 * r0 + xmm15 * r1
 *                  (r0/r1 come from MOVDDUP at offsets 0 and 1 * SIZE;
 *                   presumably a load-and-duplicate -- macro not visible)
 *
 * NOTE(review): xmm14/xmm15 are set in this view only on the .L30
 * fall-through path; the first section and .L40 rely on values established
 * before this fragment -- confirm against the routine's prologue.
 * NOTE(review): X, Y, YY, M, INCX, INCY, SIZE, ALIGN_3 and MOVDDUP are
 * macros defined elsewhere.
 */

	/* ---- tail of a group whose beginning is above this view ---- */
	mulpd	%xmm15, %xmm5
	mulpd	%xmm15, %xmm7

	addpd	%xmm1, %xmm8
	addpd	%xmm3, %xmm9
	addpd	%xmm5, %xmm10
	addpd	%xmm7, %xmm11

	movapd	%xmm8, 0 * SIZE(Y)
	movapd	%xmm9, 2 * SIZE(Y)
	movapd	%xmm10, 4 * SIZE(Y)
	movapd	%xmm11, 6 * SIZE(Y)

	/* Four more pairs at offsets 8..15: X is read in two 8-byte halves
	   (movsd + movhpd), Y with aligned 16-byte accesses (movapd). */
	movsd	8 * SIZE(X), %xmm0
	movhpd	9 * SIZE(X), %xmm0
	movsd	10 * SIZE(X), %xmm2
	movhpd	11 * SIZE(X), %xmm2
	movsd	12 * SIZE(X), %xmm4
	movhpd	13 * SIZE(X), %xmm4
	movsd	14 * SIZE(X), %xmm6
	movhpd	15 * SIZE(X), %xmm6

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3
	pshufd	$0x4e, %xmm4, %xmm5
	pshufd	$0x4e, %xmm6, %xmm7

	movapd	8 * SIZE(Y), %xmm8
	movapd	10 * SIZE(Y), %xmm9
	movapd	12 * SIZE(Y), %xmm10
	movapd	14 * SIZE(Y), %xmm11

	mulpd	%xmm14, %xmm0
	mulpd	%xmm14, %xmm2
	mulpd	%xmm14, %xmm4
	mulpd	%xmm14, %xmm6

	addpd	%xmm0, %xmm8
	addpd	%xmm2, %xmm9
	addpd	%xmm4, %xmm10
	addpd	%xmm6, %xmm11

	mulpd	%xmm15, %xmm1
	mulpd	%xmm15, %xmm3
	mulpd	%xmm15, %xmm5
	mulpd	%xmm15, %xmm7

	addpd	%xmm1, %xmm8
	addpd	%xmm3, %xmm9
	addpd	%xmm5, %xmm10
	addpd	%xmm7, %xmm11

	movapd	%xmm8, 8 * SIZE(Y)
	movapd	%xmm9, 10 * SIZE(Y)
	movapd	%xmm10, 12 * SIZE(Y)
	movapd	%xmm11, 14 * SIZE(Y)

	/* Subtracting a negative constant: both pointers advance by 16 * SIZE. */
	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y
	ALIGN_3

.L25:
	/* Remainder of four pairs: skipped when bit 2 of M is clear
	   (andq leaves ZF set on a zero result, which is what jle tests here). */
	movq	M, %rax
	andq	$4, %rax
	jle	.L26

	movsd	0 * SIZE(X), %xmm0
	movhpd	1 * SIZE(X), %xmm0
	movsd	2 * SIZE(X), %xmm2
	movhpd	3 * SIZE(X), %xmm2
	movsd	4 * SIZE(X), %xmm4
	movhpd	5 * SIZE(X), %xmm4
	movsd	6 * SIZE(X), %xmm6
	movhpd	7 * SIZE(X), %xmm6

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3
	pshufd	$0x4e, %xmm4, %xmm5
	pshufd	$0x4e, %xmm6, %xmm7

	movapd	0 * SIZE(Y), %xmm8
	movapd	2 * SIZE(Y), %xmm9
	movapd	4 * SIZE(Y), %xmm10
	movapd	6 * SIZE(Y), %xmm11

	mulpd	%xmm14, %xmm0
	mulpd	%xmm14, %xmm2
	mulpd	%xmm14, %xmm4
	mulpd	%xmm14, %xmm6

	addpd	%xmm0, %xmm8
	addpd	%xmm2, %xmm9
	addpd	%xmm4, %xmm10
	addpd	%xmm6, %xmm11

	mulpd	%xmm15, %xmm1
	mulpd	%xmm15, %xmm3
	mulpd	%xmm15, %xmm5
	mulpd	%xmm15, %xmm7

	addpd	%xmm1, %xmm8
	addpd	%xmm3, %xmm9
	addpd	%xmm5, %xmm10
	addpd	%xmm7, %xmm11

	movapd	%xmm8, 0 * SIZE(Y)
	movapd	%xmm9, 2 * SIZE(Y)
	movapd	%xmm10, 4 * SIZE(Y)
	movapd	%xmm11, 6 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L26:
	/* Remainder of two pairs: skipped when bit 1 of M is clear. */
	movq	M, %rax
	andq	$2, %rax
	jle	.L27

	movsd	0 * SIZE(X), %xmm0
	movhpd	1 * SIZE(X), %xmm0
	movsd	2 * SIZE(X), %xmm2
	movhpd	3 * SIZE(X), %xmm2

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	movapd	0 * SIZE(Y), %xmm8
	movapd	2 * SIZE(Y), %xmm9

	mulpd	%xmm14, %xmm0
	mulpd	%xmm14, %xmm2
	mulpd	%xmm15, %xmm1
	mulpd	%xmm15, %xmm3

	addpd	%xmm0, %xmm8
	addpd	%xmm2, %xmm9
	addpd	%xmm1, %xmm8
	addpd	%xmm3, %xmm9

	movapd	%xmm8, 0 * SIZE(Y)
	movapd	%xmm9, 2 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L27:
	/* Last single pair: skipped when bit 0 of M is clear.
	   .L999 is defined outside this view. */
	movq	M, %rax
	andq	$1, %rax
	jle	.L999

	movsd	0 * SIZE(X), %xmm0
	movhpd	1 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm0, %xmm1

	movapd	0 * SIZE(Y), %xmm8

	mulpd	%xmm14, %xmm0
	mulpd	%xmm15, %xmm1

	addpd	%xmm0, %xmm8
	addpd	%xmm1, %xmm8

	movapd	%xmm8, 0 * SIZE(Y)
	jmp	.L999
	ALIGN_3

.L30:
	/* Pointers advance by INCX / INCY from here on.
	   If (X & SIZE) is non-zero, go to .L40, which does not use movapd
	   on X. NOTE(review): presumably a 16-byte alignment test with
	   SIZE == 8 -- confirm. */
	testq	$SIZE, X
	jne	.L40

	/* Build the two multiplier vectors; lanes are written (low, high),
	   a = low half of xmm0 and b = low half of xmm1 on entry. */
#ifndef CONJ
	movddup	%xmm0, %xmm14		/* xmm14 = ( a,  a) */
	pxor	%xmm15, %xmm15
	subsd	%xmm1, %xmm15		/* low lane = 0 - b */
	unpcklpd	%xmm1, %xmm15	/* xmm15 = (-b,  b) */
#else
	movddup	%xmm1, %xmm15		/* xmm15 = ( b,  b) */
	movapd	%xmm0, %xmm14
	pxor	%xmm13, %xmm13
	subsd	%xmm0, %xmm13		/* low lane = 0 - a */
	unpcklpd	%xmm13, %xmm14	/* xmm14 = ( a, -a) */
#endif

	/* YY is the store pointer; Y is the load pointer and runs ahead of it. */
	movq	Y, YY

	movq	M, %rax
	sarq	$3, %rax		/* number of 8-pair iterations */
	jle	.L35

	/* Prime the pipeline: load the first four X and Y pairs. */
	movapd	(X), %xmm0
	addq	INCX, X
	movapd	(X), %xmm2
	addq	INCX, X
	movapd	(X), %xmm4
	addq	INCX, X
	movapd	(X), %xmm6
	addq	INCX, X

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3
	pshufd	$0x4e, %xmm4, %xmm5
	pshufd	$0x4e, %xmm6, %xmm7

	movapd	(Y), %xmm8
	addq	INCY, Y
	movapd	(Y), %xmm9
	addq	INCY, Y
	movapd	(Y), %xmm10
	addq	INCY, Y
	movapd	(Y), %xmm11
	addq	INCY, Y

	mulpd	%xmm14, %xmm0
	mulpd	%xmm14, %xmm2
	mulpd	%xmm14, %xmm4
	mulpd	%xmm14, %xmm6

	decq	%rax
	jle	.L32			/* only one iteration: go straight to the drain */
	ALIGN_3

.L31:
	/* Main loop, two groups of four pairs per pass. Each half finishes the
	   group already in registers, then loads and starts the next group. */
	addpd	%xmm0, %xmm8
	mulpd	%xmm15, %xmm1
	addpd	%xmm2, %xmm9
	mulpd	%xmm15, %xmm3
	addpd	%xmm4, %xmm10
	mulpd	%xmm15, %xmm5
	addpd	%xmm6, %xmm11
	mulpd	%xmm15, %xmm7

	addpd	%xmm1, %xmm8
	addpd	%xmm3, %xmm9
	addpd	%xmm5, %xmm10
	addpd	%xmm7, %xmm11

	movapd	(X), %xmm0
	addq	INCX, X
	movapd	(X), %xmm2
	addq	INCX, X
	movapd	(X), %xmm4
	addq	INCX, X
	movapd	(X), %xmm6
	addq	INCX, X

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3
	pshufd	$0x4e, %xmm4, %xmm5
	pshufd	$0x4e, %xmm6, %xmm7

	mulpd	%xmm14, %xmm0
	mulpd	%xmm14, %xmm2
	mulpd	%xmm14, %xmm4
	mulpd	%xmm14, %xmm6

	movapd	%xmm8, (YY)
	addq	INCY, YY
	movapd	%xmm9, (YY)
	addq	INCY, YY
	movapd	%xmm10, (YY)
	addq	INCY, YY
	movapd	%xmm11, (YY)
	addq	INCY, YY

	movapd	(Y), %xmm8
	addq	INCY, Y
	movapd	(Y), %xmm9
	addq	INCY, Y
	movapd	(Y), %xmm10
	addq	INCY, Y
	movapd	(Y), %xmm11
	addq	INCY, Y

	/* second half of the pass */
	addpd	%xmm0, %xmm8
	mulpd	%xmm15, %xmm1
	addpd	%xmm2, %xmm9
	mulpd	%xmm15, %xmm3
	addpd	%xmm4, %xmm10
	mulpd	%xmm15, %xmm5
	addpd	%xmm6, %xmm11
	mulpd	%xmm15, %xmm7

	addpd	%xmm1, %xmm8
	addpd	%xmm3, %xmm9
	addpd	%xmm5, %xmm10
	addpd	%xmm7, %xmm11

	movapd	(X), %xmm0
	addq	INCX, X
	movapd	(X), %xmm2
	addq	INCX, X
	movapd	(X), %xmm4
	addq	INCX, X
	movapd	(X), %xmm6
	addq	INCX, X

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3
	pshufd	$0x4e, %xmm4, %xmm5
	pshufd	$0x4e, %xmm6, %xmm7

	mulpd	%xmm14, %xmm0
	mulpd	%xmm14, %xmm2
	mulpd	%xmm14, %xmm4
	mulpd	%xmm14, %xmm6

	movapd	%xmm8, (YY)
	addq	INCY, YY
	movapd	%xmm9, (YY)
	addq	INCY, YY
	movapd	%xmm10, (YY)
	addq	INCY, YY
	movapd	%xmm11, (YY)
	addq	INCY, YY

	movapd	(Y), %xmm8
	addq	INCY, Y
	movapd	(Y), %xmm9
	addq	INCY, Y
	movapd	(Y), %xmm10
	addq	INCY, Y
	movapd	(Y), %xmm11
	addq	INCY, Y

	decq	%rax
	jg	.L31
	ALIGN_3

.L32:
	/* Drain: finish the group in flight, then process one final group
	   of four pairs without loading anything beyond it. */
	addpd	%xmm0, %xmm8
	mulpd	%xmm15, %xmm1
	addpd	%xmm2, %xmm9
	mulpd	%xmm15, %xmm3
	addpd	%xmm4, %xmm10
	mulpd	%xmm15, %xmm5
	addpd	%xmm6, %xmm11
	mulpd	%xmm15, %xmm7

	addpd	%xmm1, %xmm8
	addpd	%xmm3, %xmm9
	addpd	%xmm5, %xmm10
	addpd	%xmm7, %xmm11

	movapd	(X), %xmm0
	addq	INCX, X
	movapd	(X), %xmm2
	addq	INCX, X
	movapd	(X), %xmm4
	addq	INCX, X
	movapd	(X), %xmm6
	addq	INCX, X

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3
	pshufd	$0x4e, %xmm4, %xmm5
	pshufd	$0x4e, %xmm6, %xmm7

	mulpd	%xmm14, %xmm0
	mulpd	%xmm14, %xmm2
	mulpd	%xmm14, %xmm4
	mulpd	%xmm14, %xmm6

	movapd	%xmm8, (YY)
	addq	INCY, YY
	movapd	%xmm9, (YY)
	addq	INCY, YY
	movapd	%xmm10, (YY)
	addq	INCY, YY
	movapd	%xmm11, (YY)
	addq	INCY, YY

	movapd	(Y), %xmm8
	addq	INCY, Y
	movapd	(Y), %xmm9
	addq	INCY, Y
	movapd	(Y), %xmm10
	addq	INCY, Y
	movapd	(Y), %xmm11
	addq	INCY, Y

	addpd	%xmm0, %xmm8
	mulpd	%xmm15, %xmm1
	addpd	%xmm2, %xmm9
	mulpd	%xmm15, %xmm3
	addpd	%xmm4, %xmm10
	mulpd	%xmm15, %xmm5
	addpd	%xmm6, %xmm11
	mulpd	%xmm15, %xmm7

	addpd	%xmm1, %xmm8
	addpd	%xmm3, %xmm9
	addpd	%xmm5, %xmm10
	addpd	%xmm7, %xmm11

	movapd	%xmm8, (YY)
	addq	INCY, YY
	movapd	%xmm9, (YY)
	addq	INCY, YY
	movapd	%xmm10, (YY)
	addq	INCY, YY
	movapd	%xmm11, (YY)
	addq	INCY, YY
	ALIGN_3

.L35:
	/* Remainder of four pairs: skipped when bit 2 of M is clear. */
	movq	M, %rax
	andq	$4, %rax
	jle	.L36

	movapd	(X), %xmm0
	addq	INCX, X
	movapd	(X), %xmm2
	addq	INCX, X
	movapd	(X), %xmm4
	addq	INCX, X
	movapd	(X), %xmm6
	addq	INCX, X

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3
	pshufd	$0x4e, %xmm4, %xmm5
	pshufd	$0x4e, %xmm6, %xmm7

	movapd	(Y), %xmm8
	addq	INCY, Y
	movapd	(Y), %xmm9
	addq	INCY, Y
	movapd	(Y), %xmm10
	addq	INCY, Y
	movapd	(Y), %xmm11
	addq	INCY, Y

	mulpd	%xmm14, %xmm0
	mulpd	%xmm14, %xmm2
	mulpd	%xmm14, %xmm4
	mulpd	%xmm14, %xmm6

	addpd	%xmm0, %xmm8
	mulpd	%xmm15, %xmm1
	addpd	%xmm2, %xmm9
	mulpd	%xmm15, %xmm3
	addpd	%xmm4, %xmm10
	mulpd	%xmm15, %xmm5
	addpd	%xmm6, %xmm11
	mulpd	%xmm15, %xmm7

	addpd	%xmm1, %xmm8
	addpd	%xmm3, %xmm9
	addpd	%xmm5, %xmm10
	addpd	%xmm7, %xmm11

	movapd	%xmm8, (YY)
	addq	INCY, YY
	movapd	%xmm9, (YY)
	addq	INCY, YY
	movapd	%xmm10, (YY)
	addq	INCY, YY
	movapd	%xmm11, (YY)
	addq	INCY, YY
	ALIGN_3

.L36:
	/* Remainder of two pairs: skipped when bit 1 of M is clear. */
	movq	M, %rax
	andq	$2, %rax
	jle	.L37

	movapd	(X), %xmm0
	addq	INCX, X
	movapd	(X), %xmm2
	addq	INCX, X

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	movapd	(Y), %xmm8
	addq	INCY, Y
	movapd	(Y), %xmm9
	addq	INCY, Y

	mulpd	%xmm14, %xmm0
	mulpd	%xmm14, %xmm2
	mulpd	%xmm15, %xmm1
	mulpd	%xmm15, %xmm3

	addpd	%xmm0, %xmm8
	addpd	%xmm2, %xmm9
	addpd	%xmm1, %xmm8
	addpd	%xmm3, %xmm9

	movapd	%xmm8, (YY)
	addq	INCY, YY
	movapd	%xmm9, (YY)
	addq	INCY, YY
	ALIGN_3

.L37:
	/* Last single pair: skipped when bit 0 of M is clear. */
	movq	M, %rax
	andq	$1, %rax
	jle	.L999

	movapd	(X), %xmm0
	pshufd	$0x4e, %xmm0, %xmm1

	movapd	(Y), %xmm8

	mulpd	%xmm14, %xmm0
	mulpd	%xmm15, %xmm1

	addpd	%xmm0, %xmm8
	addpd	%xmm1, %xmm8

	movapd	%xmm8, (YY)
	jmp	.L999
	ALIGN_3

.L40:
	/* Reached from .L30 when (X & SIZE) != 0. The two values of each X
	   pair are fetched separately through MOVDDUP instead of one movapd.
	   xmm14/xmm15 are not written on this path inside this view.
	   .L45 and .L42 are outside this view. */
	movq	Y, YY

	movq	M, %rax
	sarq	$3, %rax		/* number of 8-pair iterations */
	jle	.L45

	MOVDDUP( 0 * SIZE, X, %xmm0)
	MOVDDUP( 1 * SIZE, X, %xmm1)
	addq	INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm2)
	MOVDDUP( 1 * SIZE, X, %xmm3)
	addq	INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm4)
	MOVDDUP( 1 * SIZE, X, %xmm5)
	addq	INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm6)
	MOVDDUP( 1 * SIZE, X, %xmm7)
	addq	INCX, X

	movapd	(Y), %xmm8
	addq	INCY, Y
	movapd	(Y), %xmm9
	addq	INCY, Y
	movapd	(Y), %xmm10
	addq	INCY, Y
	movapd	(Y), %xmm11
	addq	INCY, Y

	mulpd	%xmm14, %xmm0
	mulpd	%xmm14, %xmm2
	mulpd	%xmm14, %xmm4
	mulpd	%xmm14, %xmm6

	decq	%rax
	jle	.L42
	ALIGN_3

.L41:
	/* Main loop of the MOVDDUP path; same pipelining shape as .L31. */
	addpd	%xmm0, %xmm8
	mulpd	%xmm15, %xmm1
	addpd	%xmm2, %xmm9
	mulpd	%xmm15, %xmm3
	addpd	%xmm4, %xmm10
	mulpd	%xmm15, %xmm5
	addpd	%xmm6, %xmm11
	mulpd	%xmm15, %xmm7

	addpd	%xmm1, %xmm8
	addpd	%xmm3, %xmm9
	addpd	%xmm5, %xmm10
	addpd	%xmm7, %xmm11

	MOVDDUP( 0 * SIZE, X, %xmm0)
	MOVDDUP( 1 * SIZE, X, %xmm1)
	addq	INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm2)
	MOVDDUP( 1 * SIZE, X, %xmm3)
	addq	INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm4)
	MOVDDUP( 1 * SIZE, X, %xmm5)
	addq	INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm6)
	MOVDDUP( 1 * SIZE, X, %xmm7)
	addq	INCX, X

	mulpd	%xmm14, %xmm0
	mulpd	%xmm14, %xmm2
	mulpd	%xmm14, %xmm4
	mulpd	%xmm14, %xmm6

	movapd	%xmm8, (YY)
	addq	INCY, YY
	movapd	%xmm9, (YY)
	addq	INCY, YY
	movapd	%xmm10, (YY)
	addq	INCY, YY
	movapd	%xmm11, (YY)
	addq	INCY, YY

	movapd	(Y), %xmm8
	addq	INCY, Y
	movapd	(Y), %xmm9
	addq	INCY, Y
	movapd	(Y), %xmm10
	addq	INCY, Y
	movapd	(Y), %xmm11
	addq	INCY, Y

	/* second half of the pass -- the view ends partway through it */
	addpd	%xmm0, %xmm8
	mulpd	%xmm15, %xmm1
	addpd	%xmm2, %xmm9
	mulpd	%xmm15, %xmm3
	addpd	%xmm4, %xmm10
	mulpd	%xmm15, %xmm5
	addpd	%xmm6, %xmm11
	mulpd	%xmm15, %xmm7
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -