dot_sse_core2.s
	/* tail of the two-element remainder of the aligned path
	   (this section begins mid-block; the matching movsd load
	   from X is in the previous part of the file) */
	movsd	-32 * SIZE(Y), %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm3
	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L27:
	/* remainder: one last element */
	testq	$1, N
	jle	.L999
	ALIGN_3

	movss	-32 * SIZE(X), %xmm4
	mulss	-32 * SIZE(Y), %xmm4
	addss	%xmm4, %xmm0
	jmp	.L999
	ALIGN_3

.L30:
	/* X not 16-byte aligned: pick a realignment strategy */
	testq	$1 * SIZE, X
	jne	.L40

	/* X off by 8 bytes: realign with SHUFPD_1 */
	movaps	-34 * SIZE(X), %xmm4

	movq	N, %rax
	sarq	$5, %rax
	jle	.L33
	ALIGN_4

.L31:
	/* main loop, 32 elements per iteration */
	movaps	-30 * SIZE(X), %xmm5
	SHUFPD_1 %xmm5, %xmm4
	movaps	-26 * SIZE(X), %xmm6
	SHUFPD_1 %xmm6, %xmm5
	movaps	-22 * SIZE(X), %xmm7
	SHUFPD_1 %xmm7, %xmm6
	movaps	-18 * SIZE(X), %xmm8
	SHUFPD_1 %xmm8, %xmm7

	mulps	-32 * SIZE(Y), %xmm4
	addps	%xmm4, %xmm0
	mulps	-28 * SIZE(Y), %xmm5
	addps	%xmm5, %xmm1
	mulps	-24 * SIZE(Y), %xmm6
	addps	%xmm6, %xmm2
	mulps	-20 * SIZE(Y), %xmm7
	addps	%xmm7, %xmm3

	movaps	-14 * SIZE(X), %xmm9
	SHUFPD_1 %xmm9, %xmm8
	movaps	-10 * SIZE(X), %xmm10
	SHUFPD_1 %xmm10, %xmm9
	movaps	 -6 * SIZE(X), %xmm11
	SHUFPD_1 %xmm11, %xmm10
	movaps	 -2 * SIZE(X), %xmm4
	subq	$-32 * SIZE, X
	SHUFPD_1 %xmm4, %xmm11

	mulps	-16 * SIZE(Y), %xmm8
	addps	%xmm8, %xmm0
	mulps	-12 * SIZE(Y), %xmm9
	addps	%xmm9, %xmm1
	mulps	 -8 * SIZE(Y), %xmm10
	addps	%xmm10, %xmm2
	mulps	 -4 * SIZE(Y), %xmm11
	subq	$-32 * SIZE, Y
	addps	%xmm11, %xmm3

	subq	$1, %rax
	jg,pt	.L31
	ALIGN_3

.L33:
	/* remainder: 16 elements */
	testq	$16, N
	jle	.L34
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm5
	movaps	-26 * SIZE(X), %xmm6
	movaps	-22 * SIZE(X), %xmm7
	movaps	-18 * SIZE(X), %xmm8

	SHUFPD_1 %xmm5, %xmm4
	SHUFPD_1 %xmm6, %xmm5
	SHUFPD_1 %xmm7, %xmm6
	SHUFPD_1 %xmm8, %xmm7

	mulps	-32 * SIZE(Y), %xmm4
	mulps	-28 * SIZE(Y), %xmm5
	mulps	-24 * SIZE(Y), %xmm6
	mulps	-20 * SIZE(Y), %xmm7

	addps	%xmm4, %xmm0
	addps	%xmm5, %xmm1
	addps	%xmm6, %xmm2
	addps	%xmm7, %xmm3
	movaps	%xmm8, %xmm4

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L34:
	/* remainder: 8 elements */
	testq	$8, N
	jle	.L35
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm5
	SHUFPD_1 %xmm5, %xmm4
	movaps	-26 * SIZE(X), %xmm6
	SHUFPD_1 %xmm6, %xmm5

	mulps	-32 * SIZE(Y), %xmm4
	mulps	-28 * SIZE(Y), %xmm5
	addps	%xmm4, %xmm0
	addps	%xmm5, %xmm1
	movaps	%xmm6, %xmm4

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L35:
	/* remainder: 4 elements */
	testq	$4, N
	jle	.L36
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm5
	SHUFPD_1 %xmm5, %xmm4
	mulps	-32 * SIZE(Y), %xmm4
	addps	%xmm4, %xmm2
	movaps	%xmm5, %xmm4

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L36:
	/* remainder: 2 elements */
	testq	$2, N
	jle	.L37
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm4
	movsd	-32 * SIZE(Y), %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm3

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L37:
	/* remainder: 1 element */
	testq	$1, N
	jle	.L999
	ALIGN_3

	movss	-32 * SIZE(X), %xmm4
	mulss	-32 * SIZE(Y), %xmm4
	addss	%xmm4, %xmm0
	jmp	.L999
	ALIGN_3

.L40:
	/* X off by 4 bytes: realign with palignr (SSSE3) */
	movaps	-35 * SIZE(X), %xmm4

	movq	N, %rax
	sarq	$5, %rax
	jle	.L43

	movaps	-31 * SIZE(X), %xmm5
	pxor	%xmm12, %xmm12
	movaps	-27 * SIZE(X), %xmm6
	pxor	%xmm13, %xmm13
	movaps	-23 * SIZE(X), %xmm7
	pxor	%xmm14, %xmm14
	movaps	-19 * SIZE(X), %xmm8
	pxor	%xmm15, %xmm15

	decq	%rax
	jle	.L42
	ALIGN_3

.L41:
	/* software-pipelined main loop, 32 elements per iteration */
	addps	%xmm12, %xmm0
	movaps	-15 * SIZE(X), %xmm12
	addps	%xmm13, %xmm1
	movaps	-11 * SIZE(X), %xmm13
	addps	%xmm14, %xmm2
	movaps	 -7 * SIZE(X), %xmm14
	addps	%xmm15, %xmm3
	movaps	 -3 * SIZE(X), %xmm15

	movaps	%xmm5, %xmm9
	palignr	$12, %xmm4, %xmm5
	mulps	-32 * SIZE(Y), %xmm5
	movaps	%xmm6, %xmm10
	palignr	$12, %xmm9, %xmm6
	mulps	-28 * SIZE(Y), %xmm6
	movaps	%xmm7, %xmm11
	palignr	$12, %xmm10, %xmm7
	mulps	-24 * SIZE(Y), %xmm7
	movaps	%xmm8, %xmm4
	palignr	$12, %xmm11, %xmm8
	mulps	-20 * SIZE(Y), %xmm8

	addps	%xmm5, %xmm0
	movaps	 1 * SIZE(X), %xmm5
	addps	%xmm6, %xmm1
	movaps	 5 * SIZE(X), %xmm6
	addps	%xmm7, %xmm2
	movaps	 9 * SIZE(X), %xmm7
	addps	%xmm8, %xmm3
	movaps	13 * SIZE(X), %xmm8

	movaps	%xmm12, %xmm9
	palignr	$12, %xmm4, %xmm12
	mulps	-16 * SIZE(Y), %xmm12
	movaps	%xmm13, %xmm10
	palignr	$12, %xmm9, %xmm13
	mulps	-12 * SIZE(Y), %xmm13
	movaps	%xmm14, %xmm11
	palignr	$12, %xmm10, %xmm14
	mulps	 -8 * SIZE(Y), %xmm14
	subq	$-32 * SIZE, X
	movaps	%xmm15, %xmm4
	palignr	$12, %xmm11, %xmm15
	mulps	 -4 * SIZE(Y), %xmm15
	subq	$-32 * SIZE, Y

	subq	$1, %rax
	jg,pt	.L41
	ALIGN_3

.L42:
	/* drain the software pipeline */
	addps	%xmm12, %xmm0
	movaps	-15 * SIZE(X), %xmm12
	addps	%xmm13, %xmm1
	movaps	-11 * SIZE(X), %xmm13
	addps	%xmm14, %xmm2
	movaps	 -7 * SIZE(X), %xmm14
	addps	%xmm15, %xmm3
	movaps	 -3 * SIZE(X), %xmm15

	movaps	%xmm5, %xmm9
	palignr	$12, %xmm4, %xmm5
	movaps	%xmm6, %xmm10
	palignr	$12, %xmm9, %xmm6
	movaps	%xmm7, %xmm11
	palignr	$12, %xmm10, %xmm7
	movaps	%xmm8, %xmm4
	palignr	$12, %xmm11, %xmm8

	mulps	-32 * SIZE(Y), %xmm5
	mulps	-28 * SIZE(Y), %xmm6
	mulps	-24 * SIZE(Y), %xmm7
	mulps	-20 * SIZE(Y), %xmm8

	addps	%xmm5, %xmm0
	addps	%xmm6, %xmm1
	addps	%xmm7, %xmm2
	addps	%xmm8, %xmm3

	movaps	%xmm12, %xmm9
	palignr	$12, %xmm4, %xmm12
	movaps	%xmm13, %xmm10
	palignr	$12, %xmm9, %xmm13
	movaps	%xmm14, %xmm11
	palignr	$12, %xmm10, %xmm14
	movaps	%xmm15, %xmm4
	palignr	$12, %xmm11, %xmm15

	mulps	-16 * SIZE(Y), %xmm12
	mulps	-12 * SIZE(Y), %xmm13
	mulps	 -8 * SIZE(Y), %xmm14
	mulps	 -4 * SIZE(Y), %xmm15

	addps	%xmm12, %xmm0
	addps	%xmm13, %xmm1
	addps	%xmm14, %xmm2
	addps	%xmm15, %xmm3

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L43:
	/* remainder: 16 elements */
	testq	$16, N
	jle	.L44
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm5
	movaps	-27 * SIZE(X), %xmm6
	movaps	-23 * SIZE(X), %xmm7
	movaps	-19 * SIZE(X), %xmm8

	movaps	%xmm5, %xmm9
	movaps	%xmm6, %xmm10
	palignr	$12, %xmm4, %xmm5
	palignr	$12, %xmm9, %xmm6
	movaps	%xmm7, %xmm11
	movaps	%xmm8, %xmm4
	palignr	$12, %xmm10, %xmm7
	palignr	$12, %xmm11, %xmm8

	mulps	-32 * SIZE(Y), %xmm5
	mulps	-28 * SIZE(Y), %xmm6
	mulps	-24 * SIZE(Y), %xmm7
	mulps	-20 * SIZE(Y), %xmm8

	addps	%xmm5, %xmm0
	addps	%xmm6, %xmm1
	addps	%xmm7, %xmm2
	addps	%xmm8, %xmm3

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L44:
	/* remainder: 8 elements */
	testq	$8, N
	jle	.L45
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm5
	movaps	-27 * SIZE(X), %xmm6

	movaps	%xmm5, %xmm7
	movaps	%xmm6, %xmm8
	palignr	$12, %xmm4, %xmm5
	palignr	$12, %xmm7, %xmm6
	movaps	%xmm8, %xmm4

	mulps	-32 * SIZE(Y), %xmm5
	mulps	-28 * SIZE(Y), %xmm6
	addps	%xmm5, %xmm0
	addps	%xmm6, %xmm1

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L45:
	/* remainder: 4 elements */
	testq	$4, N
	jle	.L46
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm5
	palignr	$12, %xmm4, %xmm5
	mulps	-32 * SIZE(Y), %xmm5
	addps	%xmm5, %xmm2

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L46:
	/* remainder: 2 elements */
	testq	$2, N
	jle	.L47
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm4
	movsd	-32 * SIZE(Y), %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm3

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L47:
	/* remainder: 1 element */
	testq	$1, N
	jle	.L999
	ALIGN_3

	movss	-32 * SIZE(X), %xmm4
	mulss	-32 * SIZE(Y), %xmm4
	addss	%xmm4, %xmm0
	jmp	.L999
	ALIGN_3

.L50:
	/* general strided case (INCX != 1 or INCY != 1) */
#ifdef F_INTERFACE
	/* Fortran interface: rewind pointers for negative increments */
	testq	INCX, INCX
	jge	.L51

	movq	N, %rax
	decq	%rax
	imulq	INCX, %rax
	subq	%rax, X
	ALIGN_3

.L51:
	testq	INCY, INCY
	jge	.L52

	movq	N, %rax
	decq	%rax
	imulq	INCY, %rax
	subq	%rax, Y
	ALIGN_3

.L52:
#endif
	movq	N, %rax
	sarq	$2, %rax
	jle	.L55
	ALIGN_3

.L53:
	/* strided loop, 4 elements per iteration,
	   one scalar accumulator per element */
	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	mulss	0 * SIZE(Y), %xmm4
	addq	INCY, Y
	movss	0 * SIZE(X), %xmm5
	addq	INCX, X
	mulss	0 * SIZE(Y), %xmm5
	addq	INCY, Y
	movss	0 * SIZE(X), %xmm6
	addq	INCX, X
	mulss	0 * SIZE(Y), %xmm6
	addq	INCY, Y
	movss	0 * SIZE(X), %xmm7
	addq	INCX, X
	mulss	0 * SIZE(Y), %xmm7
	addq	INCY, Y

	addss	%xmm4, %xmm0
	addss	%xmm5, %xmm1
	addss	%xmm6, %xmm2
	addss	%xmm7, %xmm3

	decq	%rax
	jg	.L53
	ALIGN_3

.L55:
	movq	N, %rax
	andq	$3, %rax
	jle	.L999
	ALIGN_3

.L56:
	/* strided remainder, one element at a time */
	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	mulss	0 * SIZE(Y), %xmm4
	addq	INCY, Y
	addss	%xmm4, %xmm0
	decq	%rax
	jg	.L56
	ALIGN_3

.L999:
	/* fold the four partial-sum registers ... */
	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2
	addps	%xmm2, %xmm0

#ifndef HAVE_SSE3
	/* ... then reduce the four lanes of %xmm0 without SSE3:
	   high pair onto low pair, then lane 1 onto lane 0 */
	movhlps	%xmm0, %xmm1
	addps	%xmm1, %xmm0

	movaps	%xmm0, %xmm1
	shufps	$1, %xmm0, %xmm0
	addss	%xmm1, %xmm0
#else
	haddps	%xmm0, %xmm0
	haddps	%xmm0, %xmm0
#endif

#if !defined(DOUBLE) && defined(F_INTERFACE) && defined(NEED_F2CCONV)
	/* f2c convention returns REAL results as double */
	cvtss2sd %xmm0, %xmm0
#endif

	RESTOREREGISTERS
	ret

	EPILOGUE
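
In substance, the kernel above is a four-accumulator single-precision dot product: the unrolled loops multiply 16-byte blocks of X and Y into %xmm0..%xmm3 so that four independent addps dependency chains can overlap, the misaligned-X paths (.L30/.L40) realign data with SHUFPD_1 or palignr so every load stays 16-byte aligned, and .L999 folds the four registers and reduces the four lanes to one scalar. The C sketch below shows the same reduction scheme with SSE intrinsics. It is an illustration, not part of the BLAS source: the name sdot_sse is made up here, it uses unaligned loads (_mm_loadu_ps) instead of the kernel's realignment tricks, ignores strides, and unrolls less aggressively.

/* Hypothetical C analogue of the kernel's reduction scheme (not from
 * the BLAS source). Compile with e.g. gcc -O2. */
#include <stdio.h>
#include <xmmintrin.h>

static float sdot_sse(long n, const float *x, const float *y)
{
    /* four independent partial sums, like %xmm0..%xmm3 above */
    __m128 acc0 = _mm_setzero_ps(), acc1 = _mm_setzero_ps();
    __m128 acc2 = _mm_setzero_ps(), acc3 = _mm_setzero_ps();
    long i = 0;

    /* unrolled loop: 16 floats per iteration, 4 per accumulator
     * (the kernel unrolls to 32 and keeps its loads aligned) */
    for (; i + 16 <= n; i += 16) {
        acc0 = _mm_add_ps(acc0, _mm_mul_ps(_mm_loadu_ps(x + i),
                                           _mm_loadu_ps(y + i)));
        acc1 = _mm_add_ps(acc1, _mm_mul_ps(_mm_loadu_ps(x + i + 4),
                                           _mm_loadu_ps(y + i + 4)));
        acc2 = _mm_add_ps(acc2, _mm_mul_ps(_mm_loadu_ps(x + i + 8),
                                           _mm_loadu_ps(y + i + 8)));
        acc3 = _mm_add_ps(acc3, _mm_mul_ps(_mm_loadu_ps(x + i + 12),
                                           _mm_loadu_ps(y + i + 12)));
    }

    /* fold the four accumulators, as .L999 does with three addps */
    acc0 = _mm_add_ps(_mm_add_ps(acc0, acc1), _mm_add_ps(acc2, acc3));

    /* horizontal sum of the four lanes, the !HAVE_SSE3 way:
     * add the high pair onto the low pair, then lane 1 onto lane 0 */
    acc0 = _mm_add_ps(acc0, _mm_movehl_ps(acc0, acc0));
    acc0 = _mm_add_ss(acc0, _mm_shuffle_ps(acc0, acc0, 1));
    float sum = _mm_cvtss_f32(acc0);

    /* scalar tail, like the movss/mulss/addss remainder blocks */
    for (; i < n; i++)
        sum += x[i] * y[i];
    return sum;
}

int main(void)
{
    float x[37], y[37];
    for (int i = 0; i < 37; i++) { x[i] = (float)i; y[i] = 0.5f; }
    printf("%g\n", sdot_sse(37, x, y));   /* (0+1+...+36) * 0.5 = 333 */
    return 0;
}

Keeping four independent accumulators matters because addps has multi-cycle latency on Core2: a single accumulator would serialize every addition, while four chains let adds from consecutive blocks execute in parallel. The strided .L53 loop applies the same idea with four scalar accumulators. Note that splitting the sum this way reorders the floating-point additions, so results can differ from a strictly sequential dot product in the last bits.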