📄 gemm_kernel_8x4_sse3.s
字号:
KERNEL4 (64 * 4) KERNEL5 (64 * 4) KERNEL6 (64 * 4) KERNEL7 (64 * 4) KERNEL8 (64 * 4) KERNEL9 (64 * 4) KERNEL10(64 * 4) KERNEL11(64 * 4) KERNEL12(64 * 4) KERNEL13(64 * 4) KERNEL14(64 * 4) KERNEL15(64 * 4) KERNEL16(64 * 4) cmpq $128 * 5, %rax jle,pn .L12 KERNEL1 (64 * 5) KERNEL2 (64 * 5) KERNEL3 (64 * 5) KERNEL4 (64 * 5) KERNEL5 (64 * 5) KERNEL6 (64 * 5) KERNEL7 (64 * 5) KERNEL8 (64 * 5) KERNEL9 (64 * 5) KERNEL10(64 * 5) KERNEL11(64 * 5) KERNEL12(64 * 5) KERNEL13(64 * 5) KERNEL14(64 * 5) KERNEL15(64 * 5) KERNEL16(64 * 5) cmpq $128 * 6, %rax jle,pn .L12 KERNEL1 (64 * 6) KERNEL2 (64 * 6) KERNEL3 (64 * 6) KERNEL4 (64 * 6) KERNEL5 (64 * 6) KERNEL6 (64 * 6) KERNEL7 (64 * 6) KERNEL8 (64 * 6) KERNEL9 (64 * 6) KERNEL10(64 * 6) KERNEL11(64 * 6) KERNEL12(64 * 6) KERNEL13(64 * 6) KERNEL14(64 * 6) KERNEL15(64 * 6) KERNEL16(64 * 6) cmpq $128 * 7, %rax jle,pn .L12 KERNEL1 (64 * 7) KERNEL2 (64 * 7) KERNEL3 (64 * 7) KERNEL4 (64 * 7) KERNEL5 (64 * 7) KERNEL6 (64 * 7) KERNEL7 (64 * 7) KERNEL8 (64 * 7) KERNEL9 (64 * 7) KERNEL10(64 * 7) KERNEL11(64 * 7) KERNEL12(64 * 7) KERNEL13(64 * 7) KERNEL14(64 * 7) KERNEL15(64 * 7) KERNEL16(64 * 7) addq $64 * 8 * SIZE, AO addq $64 * 8 * SIZE, BO subq $128 * 8, %rax jg .L1X.L12: leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO#else sarq $3, %rax je .L15 ALIGN_4.L12: KERNEL1 (64 * 0) KERNEL2 (64 * 0) KERNEL3 (64 * 0) KERNEL4 (64 * 0) KERNEL5 (64 * 0) KERNEL6 (64 * 0) KERNEL7 (64 * 0) KERNEL8 (64 * 0) KERNEL9 (64 * 0) KERNEL10(64 * 0) KERNEL11(64 * 0) KERNEL12(64 * 0) KERNEL13(64 * 0) KERNEL14(64 * 0) KERNEL15(64 * 0) KERNEL16(64 * 0) addq $64 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L12#endif ALIGN_4.L15:#ifndef TRMMKERNEL movq K, %rax#else movq KKK, %rax#endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L18 ALIGN_4.L16: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsldup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm4 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm5 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm6 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm7 movsldup 8 * SIZE(BO), %xmm9 addq $8 * SIZE, AO addq $8 * SIZE, BO decq %rax jg .L16 ALIGN_4.L18:#if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 mulps %xmm15, %xmm0 movhps 2 * SIZE(CO1), %xmm8 mulps %xmm15, %xmm1 movsd 4 * SIZE(CO1), %xmm9 mulps %xmm15, %xmm2 movhps 6 * SIZE(CO1), %xmm9 mulps %xmm15, %xmm3 movsd 0 * SIZE(CO2), %xmm10 mulps %xmm15, %xmm4 movhps 2 * SIZE(CO2), %xmm10 mulps %xmm15, %xmm5 movsd 4 * SIZE(CO2), %xmm11 mulps %xmm15, %xmm6 movhps 6 * SIZE(CO2), %xmm11 mulps %xmm15, %xmm7 movsd 0 * SIZE(CO1, LDC, 2), %xmm12 movhps 2 * SIZE(CO1, LDC, 2), %xmm12 movsd 4 * SIZE(CO1, LDC, 2), %xmm13 movhps 6 * SIZE(CO1, LDC, 2), %xmm13 movsd 0 * SIZE(CO2, LDC, 2), %xmm14 movhps 2 * SIZE(CO2, LDC, 2), %xmm14 movsd 4 * SIZE(CO2, LDC, 2), %xmm15 movhps 6 * SIZE(CO2, LDC, 2), %xmm15 addps %xmm8, %xmm0 addps %xmm9, %xmm4 addps %xmm10, %xmm1 addps %xmm11, %xmm5 addps %xmm12, %xmm2 movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) addps %xmm13, %xmm6 movsd %xmm4, 4 * SIZE(CO1) movhps %xmm4, 6 * SIZE(CO1) addps %xmm14, %xmm3 movsd %xmm1, 0 * SIZE(CO2) movhps %xmm1, 2 * SIZE(CO2) addps %xmm15, %xmm7 movsd %xmm5, 4 * SIZE(CO2) movhps %xmm5, 6 * SIZE(CO2)#else mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 mulps %xmm15, %xmm2 mulps %xmm15, %xmm3 mulps %xmm15, %xmm4 mulps %xmm15, %xmm5 mulps %xmm15, %xmm6 mulps %xmm15, %xmm7 movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm4, 4 * SIZE(CO1) movhps %xmm4, 6 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movhps %xmm1, 2 * SIZE(CO2) movsd %xmm5, 4 * SIZE(CO2) movhps %xmm5, 6 * SIZE(CO2)#endif movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 2 * SIZE(CO1, LDC, 2) movsd %xmm6, 4 * SIZE(CO1, LDC, 2) movhps %xmm6, 6 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO2, LDC, 2) movhps %xmm3, 2 * SIZE(CO2, LDC, 2) movsd %xmm7, 4 * SIZE(CO2, LDC, 2) movhps %xmm7, 6 * SIZE(CO2, LDC, 2)#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO#endif#if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK#endif addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 ALIGN_4 .L20: testq $4, M je .L30#if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO#else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movsldup 0 * SIZE(BO), %xmm9 movsldup 16 * SIZE(BO), %xmm11 movsldup 32 * SIZE(BO), %xmm13 movsldup 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3#ifndef TRMMKERNEL movq K, %rax#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax#ifdef LEFT addq $4, %rax#else addq $4, %rax#endif movq %rax, KKK#endif sarq $3, %rax je .L25 ALIGN_4.L22: mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsldup 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsldup 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsldup 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movshdup 16 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movsldup 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movshdup 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movsldup 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movshdup 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movsldup 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movshdup 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movaps 32 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movsldup 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movshdup 32 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movsldup 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movshdup 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movaps 20 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movsldup 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movshdup 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movsldup 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movshdup 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movaps 24 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movsldup 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movshdup 48 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movsldup 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movshdup 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movaps 28 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movsldup 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movshdup 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movsldup 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movshdup 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movaps 48 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movsldup 112 * SIZE(BO), %xmm15 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L22 ALIGN_4.L25:#ifndef TRMMKERNEL movq K, %rax#else movq KKK, %rax#endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L28 ALIGN_4.L26: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsldup 8 * SIZE(BO), %xmm9 addq $4 * SIZE, AO addq $8 * SIZE, BO decq %rax jg .L26 ALIGN_4.L28:#if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm10 movhps 2 * SIZE(CO2), %xmm10 mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 mulps %xmm15, %xmm2 mulps %xmm15, %xmm3 movsd 0 * SIZE(CO1, LDC, 2), %xmm12 movhps 2 * SIZE(CO1, LDC, 2), %xmm12 movsd 0 * SIZE(CO2, LDC, 2), %xmm14 movhps 2 * SIZE(CO2, LDC, 2), %xmm14 addps %xmm8, %xmm0 addps %xmm10, %xmm1 addps %xmm12, %xmm2 addps %xmm14, %xmm3#else mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 mulps %xmm15, %xmm2 mulps %xmm15, %xmm3#endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movhps %xmm1, 2 * SIZE(CO2) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 2 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO2, LDC, 2) movhps %xmm3, 2 * SIZE(CO2, LDC, 2)#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO#endif#if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK#endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $2, M je .L40#if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO#else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO#endif movddup 0 * SIZE(AO), %xmm8 movddup 8 * SIZE(AO), %xmm10 movsd 0 * SIZE(BO), %xmm9 movsd 32 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3#ifndef TRMMKERNEL movq K, %rax#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax#ifdef LEFT addq $2, %rax#else addq $4, %rax#endif movq %rax, KKK#endif sarq $3, %rax je .L35 ALIGN_4.L32: shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movsd 12 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsd 16 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movsd 20 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 6 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 24 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movsd 28 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 16 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movsd 64 * SIZE(BO), %xmm9 addps %xmm11, %xmm0 movsd 36 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 10 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 40 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movsd 44 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movsd 48 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movsd 52 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -