📄 gemm_kernel_2x4_barcelona.s
字号:
/*
 * Tail of a GEMM/TRMM kernel routine (x86 AT&T syntax, run through the C
 * preprocessor).  The excerpt starts at the bottom of the loop headed by
 * .L01, which lies outside this excerpt, and then handles what is left of N:
 *
 *   .L30 - .L59   taken when bit 1 of N is set; writes through CO and
 *                 CO + LDC and finally advances C by 2 * LDC
 *   .L60 - .L88   taken when bit 0 of N is set; writes through CO only
 *   .L999         restores four registers, releases ARGS and returns
 *
 * Inside each section the M dimension is processed two elements at a time
 * (I = M >> 1 passes) followed by an optional single-element pass (M & 1).
 * Every inner product is split into an 8-way unrolled loop (count >> 3
 * iterations) and a remainder loop (count & 7 iterations).
 *
 * A, B, C, M, N, K, LDC, ALPHA, OFFSET, AO, BO, CO, I, J, KK, KKK, SIZE,
 * ARGS, PREFETCHSIZE, PREFETCH, BRANCH, ALIGN_2/3/4 and EPILOGUE are all
 * defined outside this excerpt.
 * NOTE(review): the packed-double instructions suggest SIZE is the size of
 * a double and that LDC is already scaled to bytes - confirm against the
 * definitions earlier in the file.
 * NOTE(review): the -16 * SIZE displacements imply AO and BO are biased by
 * +16 * SIZE somewhere before this excerpt - confirm.
 */
	leal	(, LDC, 4), %eax
	addl	%eax, C			# c += 4 * ldc
	decl	J			# j --
	jg	.L01
	ALIGN_4

/* ---- N & 2 section -------------------------------------------------- */
.L30:
	testl	$2, N
	je	.L60			/* bit 1 of N clear: skip to the N & 1 section */
	ALIGN_2

.L31:
#if defined(TRMMKERNEL) && defined(LEFT)
	/* Reload KK from OFFSET at the start of the section. */
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	C, CO			# coffset = c
	movl	A, AO			# aoffset = a

	movl	M,  I
	sarl	$1, I			# i = (m >> 1)
	jle	.L50			/* no pair of M elements: go to the M & 1 pass */
	ALIGN_4

/* Two M elements against two B values per k step. */
.L41:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BO
#else
	/* Start KK steps into the data: AO += 2 * KK * SIZE, BO = B + 2 * KK * SIZE. */
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 2), BO
#endif

	/* Preload the first AO value into both lanes and clear the accumulators. */
	movddup	-16 * SIZE(AO), %xmm0
	pxor	%xmm4, %xmm4
	prefetchw 1 * SIZE(CO)
	pxor	%xmm5, %xmm5
	prefetchw 1 * SIZE(CO, LDC)
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	/* %eax = number of k steps for this pass (also saved in KKK for TRMM). */
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
	/* Both arms add 2 here. */
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$2, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L45			/* fewer than 8 steps: remainder loop only */
	ALIGN_4

/*
 * Unrolled by 8 k steps.  Each step multiplies one BO pair by two broadcast
 * AO values; even steps accumulate into xmm4/xmm5, odd steps into xmm6/xmm7.
 */
.L42:
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AO)
	/* step 0 */
	mulpd	-16 * SIZE(BO), %xmm0
	movddup	-15 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm4
	mulpd	-16 * SIZE(BO), %xmm1
	movddup	-14 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm5
	/* step 1 */
	mulpd	-14 * SIZE(BO), %xmm0
	movddup	-13 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm6
	mulpd	-14 * SIZE(BO), %xmm1
	movddup	-12 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm7
	/* step 2 */
	mulpd	-12 * SIZE(BO), %xmm0
	movddup	-11 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm4
	mulpd	-12 * SIZE(BO), %xmm1
	movddup	-10 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm5
	/* step 3 */
	mulpd	-10 * SIZE(BO), %xmm0
	movddup	 -9 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm6
	mulpd	-10 * SIZE(BO), %xmm1
	movddup	 -8 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm7

	prefetcht0 (PREFETCHSIZE + 8) * SIZE(AO)
	/* step 4 */
	mulpd	 -8 * SIZE(BO), %xmm0
	movddup	 -7 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm4
	mulpd	 -8 * SIZE(BO), %xmm1
	movddup	 -6 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm5
	/* step 5 */
	mulpd	 -6 * SIZE(BO), %xmm0
	movddup	 -5 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm6
	mulpd	 -6 * SIZE(BO), %xmm1
	movddup	 -4 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm7
	/* step 6 */
	mulpd	 -4 * SIZE(BO), %xmm0
	movddup	 -3 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm4
	mulpd	 -4 * SIZE(BO), %xmm1
	movddup	 -2 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm5
	/* step 7; the last movddup preloads the AO value for the next iteration */
	mulpd	 -2 * SIZE(BO), %xmm0
	movddup	 -1 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm6
	mulpd	 -2 * SIZE(BO), %xmm1
	movddup	  0 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm7

	/* Subtracting a negative constant advances both pointers by 16 * SIZE. */
	subl	$-16 * SIZE, AO
	subl	$-16 * SIZE, BO
	decl	%eax
	jne	.L42
	ALIGN_4

.L45:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movddup	ALPHA, %xmm3		/* ALPHA in both lanes */
	andl	$7, %eax		# remaining steps: k & 7
	BRANCH
	je	.L48
	ALIGN_3

/* Remainder loop: one k step per iteration, accumulating into xmm4/xmm5. */
.L46:
	mulpd	-16 * SIZE(BO), %xmm0
	movddup	-15 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm4
	mulpd	-16 * SIZE(BO), %xmm1
	movddup	-14 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm5

	addl	$2 * SIZE, AO
	addl	$2 * SIZE, BO
	decl	%eax
	jg	.L46
	ALIGN_4

/*
 * Write-back.  Lane layout used by the loads and stores below:
 *   xmm4 = { 0 * SIZE(CO), 0 * SIZE(CO, LDC) }
 *   xmm5 = { 1 * SIZE(CO), 1 * SIZE(CO, LDC) }
 */
.L48:
#ifndef TRMMKERNEL
	/* Fetch the current C values so they can be added to the scaled result. */
	movsd	0 * SIZE(CO), %xmm0
	movhpd	0 * SIZE(CO, LDC), %xmm0
	movsd	1 * SIZE(CO), %xmm1
	movhpd	1 * SIZE(CO, LDC), %xmm1
#endif

	/* Fold the odd-step accumulators into the even-step ones, then scale by ALPHA. */
	addpd	%xmm6, %xmm4
	addpd	%xmm7, %xmm5

	mulpd	%xmm3, %xmm4
	mulpd	%xmm3, %xmm5

#ifndef TRMMKERNEL
	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm5
#endif

	movlpd	%xmm4, 0 * SIZE(CO)
	movlpd	%xmm5, 1 * SIZE(CO)
	movhpd	%xmm4, 0 * SIZE(CO, LDC)
	movhpd	%xmm5, 1 * SIZE(CO, LDC)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* Advance AO and BO past the K - KKK steps that were not computed. */
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AO, %eax, 2), AO
	leal	(BO, %eax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif

	addl	$2 * SIZE, CO		# coffset += 2
	decl	I			# i --
	jg	.L41
	ALIGN_4

/* Single leftover M element against two B values per k step. */
.L50:
	movl	M,  I
	testl	$1, I			# leftover element when (m & 1) != 0
	jle	.L59			/* testl clears SF and OF, so jle is taken only on zero */

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BO
#else
	/* AO += 1 * KK * SIZE, BO = B + 2 * KK * SIZE. */
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 2), BO
#endif

	movddup	-16 * SIZE(AO), %xmm0
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$2, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L55
	ALIGN_4

/* Unrolled by 8 k steps; every step accumulates into xmm4. */
.L52:
	mulpd	-16 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	-15 * SIZE(AO), %xmm0
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	-14 * SIZE(AO), %xmm0
	mulpd	-12 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	-13 * SIZE(AO), %xmm0
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	-12 * SIZE(AO), %xmm0
	mulpd	 -8 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	-11 * SIZE(AO), %xmm0
	mulpd	 -6 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	-10 * SIZE(AO), %xmm0
	mulpd	 -4 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	 -9 * SIZE(AO), %xmm0
	mulpd	 -2 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	 -8 * SIZE(AO), %xmm0

	/* AO advances by 8 * SIZE, BO by 16 * SIZE. */
	subl	$ -8 * SIZE, AO
	subl	$-16 * SIZE, BO
	decl	%eax
	jne	.L52
	ALIGN_4

.L55:
	movddup	ALPHA, %xmm3

#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax		# remaining steps: k & 7
	BRANCH
	je	.L58

/* Remainder loop: one k step per iteration. */
.L56:
	mulpd	-16 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	-15 * SIZE(AO), %xmm0

	subl	$-1 * SIZE, AO
	subl	$-2 * SIZE, BO
	decl	%eax
	jg	.L56
	ALIGN_4

/* Write-back: xmm4 = { 0 * SIZE(CO), 0 * SIZE(CO, LDC) }. */
.L58:
	/* xmm5-xmm7 are cleared above and not written by .L52/.L56, so these adds contribute zero. */
	addpd	%xmm6, %xmm4
	addpd	%xmm7, %xmm5
	addpd	%xmm5, %xmm4

	mulpd	%xmm3, %xmm4

#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO), %xmm0
	movhpd	0 * SIZE(CO, LDC), %xmm0
	addpd	%xmm0, %xmm4
#endif

	movlpd	%xmm4, 0 * SIZE(CO)
	movhpd	%xmm4, 0 * SIZE(CO, LDC, 1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AO, %eax, 1), AO
	leal	(BO, %eax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif
	ALIGN_4

/* End of the N & 2 section: commit BO to B and move C forward. */
.L59:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$2, KK
#endif

	movl	BO, B

	leal	(, LDC, 2), %eax
	addl	%eax, C			# c += 2 * ldc
	ALIGN_4

/* ---- N & 1 section -------------------------------------------------- */
.L60:
	testl	$1, N
	je	.L999			/* bit 0 of N clear: nothing left to do */

#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	C, CO			# coffset = c
	movl	A, AO			# aoffset = a

	movl	M,  I
	sarl	$1, I			# i = (m >> 1)
	jle	.L80			/* no pair of M elements: go to the M & 1 pass */
	ALIGN_4

/* Two M elements (one AO pair) against one broadcast BO value per k step. */
.L71:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BO
#else
	/* AO += 2 * KK * SIZE, BO = B + 1 * KK * SIZE. */
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 1), BO
#endif

	movddup	-16 * SIZE(BO), %xmm0
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	prefetchw 1 * SIZE(CO)

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L75
	ALIGN_4

/* Unrolled by 8 k steps; every step accumulates into xmm4. */
.L72:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)

	mulpd	-16 * SIZE(AO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	-15 * SIZE(BO), %xmm0
	mulpd	-14 * SIZE(AO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	-14 * SIZE(BO), %xmm0
	mulpd	-12 * SIZE(AO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	-13 * SIZE(BO), %xmm0
	mulpd	-10 * SIZE(AO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	-12 * SIZE(BO), %xmm0

	PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)

	mulpd	 -8 * SIZE(AO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	-11 * SIZE(BO), %xmm0
	mulpd	 -6 * SIZE(AO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	-10 * SIZE(BO), %xmm0
	mulpd	 -4 * SIZE(AO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	 -9 * SIZE(BO), %xmm0
	mulpd	 -2 * SIZE(AO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	 -8 * SIZE(BO), %xmm0

	/* AO advances by 16 * SIZE, BO by 8 * SIZE. */
	subl	$-16 * SIZE, AO
	subl	$ -8 * SIZE, BO
	decl	%eax
	jne	.L72
	ALIGN_4

.L75:
	movddup	ALPHA, %xmm3

#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax		# remaining steps: k & 7
	BRANCH
	je	.L78
	ALIGN_3

/* Remainder loop: one k step per iteration. */
.L76:
	mulpd	-16 * SIZE(AO), %xmm0
	addpd	%xmm0, %xmm4
	movddup	-15 * SIZE(BO), %xmm0

	addl	$2 * SIZE, AO
	addl	$1 * SIZE, BO
	decl	%eax
	jg	.L76
	ALIGN_4

/* Write-back: xmm4 = { 0 * SIZE(CO), 1 * SIZE(CO) }. */
.L78:
	mulpd	%xmm3, %xmm4

#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO), %xmm0
	movhpd	1 * SIZE(CO), %xmm0
	addpd	%xmm0, %xmm4
#endif

	movsd	%xmm4, 0 * SIZE(CO)
	movhpd	%xmm4, 1 * SIZE(CO)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AO, %eax, 2), AO
	leal	(BO, %eax, 1), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif

	addl	$2 * SIZE, CO		# coffset += 2
	decl	I			# i --
	jg	.L71
	ALIGN_4

/* Single leftover M element against a single B value: a plain dot product. */
.L80:
	movl	M,  I
	testl	$1, I			# leftover element when (m & 1) != 0
	jle	.L999			/* testl clears SF and OF, so jle is taken only on zero */

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BO
#else
	/* AO += 1 * KK * SIZE, BO = B + 1 * KK * SIZE. */
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 1), BO
#endif

	/* Aligned 16-byte load: two consecutive AO values, one per lane. */
	movaps	-16 * SIZE(AO), %xmm0
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
	/* Both arms add 1 here. */
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L85
	ALIGN_4

/*
 * Unrolled by 8 k steps: four packed multiplies of two steps each, spread
 * over the four accumulators xmm4-xmm7.
 */
.L82:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)

	mulpd	-16 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm4
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm5
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	-12 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm6
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm7
	movapd	 -8 * SIZE(AO), %xmm0

	subl	$-8 * SIZE, AO
	subl	$-8 * SIZE, BO
	decl	%eax
	jne	.L82
	ALIGN_4

.L85:
	movddup	ALPHA, %xmm3

#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax		# remaining steps: k & 7
	BRANCH
	je	.L88

/* Remainder loop: scalar, one k step per iteration, low lane of xmm4 only. */
.L86:
	mulsd	-16 * SIZE(BO), %xmm0
	addsd	%xmm0, %xmm4
	movsd	-15 * SIZE(AO), %xmm0

	addl	$1 * SIZE, AO
	addl	$1 * SIZE, BO
	decl	%eax
	jg	.L86
	ALIGN_4

/* Reduce the four accumulators and both lanes to one scalar, then write back. */
.L88:
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6
	addpd	%xmm6, %xmm4

	haddpd	%xmm4, %xmm4		/* low lane = sum of both lanes */

	mulsd	%xmm3, %xmm4

#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO), %xmm0
	addsd	%xmm0, %xmm4
#endif

	movsd	%xmm4, 0 * SIZE(CO)
	ALIGN_4

/*
 * Common exit.
 * NOTE(review): the four pops presumably mirror pushes in the prologue,
 * which is outside this excerpt - confirm the order matches.
 */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS, %esp		/* release the ARGS bytes of stack */
	ret

	EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -