📄 gemm_kernel.s
字号:
FMUL c15, ALPHA, c15 add C3, 4 * SIZE, C3 FADD c03, a3, c03 LDF [C4 + 2 * SIZE], a3 FMUL c16, ALPHA, c16 nop FADD c04, a4, c04 LDF [C4 + 3 * SIZE], a4 STF c01, [C1 - 4 * SIZE] FADD c05, b1, c05 STF c02, [C1 - 3 * SIZE] FADD c06, b2, c06 STF c03, [C1 - 2 * SIZE] FADD c07, b3, c07 STF c04, [C1 - 1 * SIZE] FADD c08, b4, c08 STF c05, [C2 - 4 * SIZE] FADD c09, t1, c09 STF c06, [C2 - 3 * SIZE] FADD c10, t2, c10 STF c07, [C2 - 2 * SIZE] FADD c11, t3, c11 STF c08, [C2 - 1 * SIZE] FADD c12, t4, c12 STF c09, [C3 - 4 * SIZE] FADD c13, a1, c13 STF c10, [C3 - 3 * SIZE] FADD c14, a2, c14 STF c11, [C3 - 2 * SIZE] FADD c15, a3, c15 STF c12, [C3 - 1 * SIZE] FADD c16, a4, c16 STF c13, [C4 + 0 * SIZE] FMOV FZERO, t1 STF c14, [C4 + 1 * SIZE] FMOV FZERO, t2 STF c15, [C4 + 2 * SIZE] FMOV FZERO, t3 STF c16, [C4 + 3 * SIZE] FMOV FZERO, t4 add C4, 4 * SIZE, C4#else FADD c04, t1, c04 FMUL c01, ALPHA, c01 FADD c08, t2, c08 FMUL c02, ALPHA, c02 FADD c12, t3, c12 FMUL c03, ALPHA, c03 FADD c16, t4, c16 FMUL c04, ALPHA, c04 STF c01, [C1 + 0 * SIZE] FMUL c05, ALPHA, c05 STF c02, [C1 + 1 * SIZE] FMUL c06, ALPHA, c06 STF c03, [C1 + 2 * SIZE] FMUL c07, ALPHA, c07 STF c04, [C1 + 3 * SIZE] FMUL c08, ALPHA, c08 STF c05, [C2 + 0 * SIZE] FMUL c09, ALPHA, c09 STF c06, [C2 + 1 * SIZE] FMUL c10, ALPHA, c10 STF c07, [C2 + 2 * SIZE] FMUL c11, ALPHA, c11 STF c08, [C2 + 3 * SIZE] FMUL c12, ALPHA, c12 STF c09, [C3 + 0 * SIZE] FMUL c13, ALPHA, c13 STF c10, [C3 + 1 * SIZE] FMUL c14, ALPHA, c14 STF c11, [C3 + 2 * SIZE] FMUL c15, ALPHA, c15 STF c12, [C3 + 3 * SIZE] FMUL c16, ALPHA, c16 STF c13, [C4 + 0 * SIZE] STF c14, [C4 + 1 * SIZE] STF c15, [C4 + 2 * SIZE] STF c16, [C4 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 add C1, 4 * SIZE, C1 add C2, 4 * SIZE, C2 add C3, 4 * SIZE, C3 add C4, 4 * SIZE, C4#if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1#ifdef LEFT add TEMP1, -4, TEMP1#else add TEMP1, -4, TEMP1#endif sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO#endif#ifdef LEFT add KK, 4, KK#endif add I, -1, I cmp I, 0#endif sra K, 2, L bg,pt %icc, .LL21 FMOV FZERO, c01.LL50: and M, 2, I FMOV FZERO, c02 cmp I, 0 FMOV FZERO, t1 ble,pn %icc, .LL70 FMOV FZERO, c04#if !defined(TRMMKERNEL) LDF [AO + 0 * SIZE], a1 sra K, 2, L FMOV FZERO, t2 LDF [B + 0 * SIZE], b1 mov B, BO FMOV FZERO, c06 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, t3 LDF [B + 1 * SIZE], b2 FMOV FZERO, c08 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t4 LDF [B + 2 * SIZE], b3 FMOV FZERO, c01 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c03 LDF [B + 3 * SIZE], b4 FMOV FZERO, c05#else#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO#else sll KK, 1 + BASE_SHIFT, TEMP1 sll KK, 2 + BASE_SHIFT, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L#elif defined(LEFT) add KK, 2, L#else add KK, 4, L#endif sra L, 2, L cmp L, 0 LDF [AO + 0 * SIZE], a1 FMOV FZERO, t2 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c06 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t3 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c08 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t4 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c01 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c03 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c05#endif ble,pn %icc, .LL55 FMOV FZERO, c07.LL52: FADD c02, t1, c02 add AO, 8 * SIZE, AO prefetch [AO + APREFETCHSIZE * SIZE], 0 FMUL a1, b1, t1 add BO, 16 * SIZE, BO FADD c04, t2, c04 add L, -1, L FMUL a1, b2, t2 FADD c06, t3, c06 cmp L, 0 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO - 4 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 12 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 11 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 10 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 9 * SIZE], b4 FADD c02, t1, c02 FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD c04, t2, c04 FMUL a3, b2, t2 FADD c06, t3, c06 FMUL a3, b3, t3 FADD c08, t4, c08 FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD c03, t2, c03 FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD c05, t3, c05 FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD c07, t4, c07 FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD c02, t1, c02 FMUL a1, b1, t1 LDF [AO - 1 * SIZE], a4 FADD c04, t2, c04 FMUL a1, b2, t2 FADD c06, t3, c06 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 4 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 3 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 1 * SIZE], b4 FADD c02, t1, c02 FMUL a3, b1, t1 LDF [AO + 1 * SIZE], a2 FADD c04, t2, c04 FMUL a3, b2, t2 FADD c06, t3, c06 FMUL a3, b3, t3 FADD c08, t4, c08 FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c03, t2, c03 FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c05, t3, c05 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c07, t4, c07 FMUL a4, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL52 LDF [AO + 3 * SIZE], a4.LL55:#ifndef TRMMKERNEL and K, 3, L#else#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L#elif defined(LEFT) add KK, 2, L#else add KK, 4, L#endif and L, 3, L#endif cmp L, 0 ble,a,pn %icc, .LL59 nop.LL56: FADD c02, t1, c02 add AO, 2 * SIZE, AO FMUL a1, b1, t1 add L, -1, L add BO, 4 * SIZE, BO FADD c04, t2, c04 cmp L, 0 FMUL a1, b2, t2 FADD c06, t3, c06 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL56 LDF [AO + 1 * SIZE], a2.LL59:#ifndef TRMMKERNEL FADD c02, t1, c02 FMUL c01, ALPHA, c01 LDF [C1 + 0 * SIZE], a1 FADD c04, t2, c04 FMUL c03, ALPHA, c03 LDF [C1 + 1 * SIZE], a2 FADD c06, t3, c06 FMUL c05, ALPHA, c05 LDF [C2 + 0 * SIZE], a3 FADD c08, t4, c08 FMUL c07, ALPHA, c07 LDF [C2 + 1 * SIZE], a4 FMUL c02, ALPHA, c02 FADD c01, a1, c01 LDF [C3 + 0 * SIZE], b1 FMUL c04, ALPHA, c04 FADD c02, a2, c02 LDF [C3 + 1 * SIZE], b2 FMUL c06, ALPHA, c06 FADD c03, a3, c03 LDF [C4 + 0 * SIZE], b3 FMUL c08, ALPHA, c08 FADD c04, a4, c04 LDF [C4 + 1 * SIZE], b4 STF c01, [C1 + 0 * SIZE] FADD c05, b1, c05 STF c02, [C1 + 1 * SIZE] FADD c06, b2, c06 add C1, 2 * SIZE, C1 STF c03, [C2 + 0 * SIZE] FADD c07, b3, c07 STF c04, [C2 + 1 * SIZE] FADD c08, b4, c08 add C2, 2 * SIZE, C2 STF c05, [C3 + 0 * SIZE] STF c06, [C3 + 1 * SIZE] add C3, 2 * SIZE, C3 STF c07, [C4 + 0 * SIZE] STF c08, [C4 + 1 * SIZE] add C4, 2 * SIZE, C4#else FADD c02, t1, c02 FADD c04, t2, c04 FADD c06, t3, c06 FADD c08, t4, c08 FMUL c01, ALPHA, c01 FMUL c03, ALPHA, c03 FMUL c05, ALPHA, c05 FMUL c07, ALPHA, c07 FMUL c02, ALPHA, c02 FMUL c04, ALPHA, c04 FMUL c06, ALPHA, c06 FMUL c08, ALPHA, c08 STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] STF c05, [C3 + 0 * SIZE] STF c06, [C3 + 1 * SIZE] STF c07, [C4 + 0 * SIZE] STF c08, [C4 + 1 * SIZE] add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 add C3, 2 * SIZE, C3 add C4, 2 * SIZE, C4#if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1#ifdef LEFT add TEMP1, -2, TEMP1#else add TEMP1, -4, TEMP1#endif sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO#endif#ifdef LEFT add KK, 2, KK#endif#endif.LL70: and M, 1, I cmp I, 0 ble,pn %icc, .LL99 nop.LL71:#if !defined(TRMMKERNEL) LDF [AO + 0 * SIZE], a1 sra K, 2, L FMOV FZERO, c01 LDF [B + 0 * SIZE], b1 mov B, BO FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, c02 LDF [B + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [B + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [B + 3 * SIZE], b4 FMOV FZERO, t4#else#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO#else sll KK, 0 + BASE_SHIFT, TEMP1 sll KK, 2 + BASE_SHIFT, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L#elif defined(LEFT) add KK, 1, L#else add KK, 4, L#endif sra L, 2, L cmp L, 0 LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4#endif ble,pn %icc, .LL75 nop.LL72: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 cmp L, 0 FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 FADD c03, t3, c03 FMUL a1, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 FMUL a1, b4, t4 LDF [BO + 7 * SIZE], b4 LDF [AO + 4 * SIZE], a1 FADD c01, t1, c01 add AO, 4 * SIZE, AO FMUL a2, b1, t1 LDF [BO + 8 * SIZE], b1 FADD c02, t2, c02 FMUL a2, b2, t2 LDF [BO + 9 * SIZE], b2 FADD c03, t3, c03 FMUL a2, b3, t3 LDF [BO + 10 * SIZE], b3 FADD c04, t4, c04 FMUL a2, b4, t4 LDF [BO + 11 * SIZE], b4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b1, t1 LDF [BO + 12 * SIZE], b1 FADD c02, t2, c02 FMUL a3, b2, t2 LDF [BO + 13 * SIZE], b2 FADD c03, t3, c03 FMUL a3, b3, t3 LDF [BO + 14 * SIZE], b3 FADD c04, t4, c04 FMUL a3, b4, t4 LDF [BO + 15 * SIZE], b4 LDF [AO + 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO + 16 * SIZE], b1 FADD c02, t2, c02 FMUL a4, b2, t2 LDF [BO + 17 * SIZE], b2 FADD c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 18 * SIZE], b3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [BO + 19 * SIZE], b4 add BO, 16 * SIZE, BO bg,pt %icc, .LL72 LDF [AO + 3 * SIZE], a4.LL75:#ifndef TRMMKERNEL and K, 3, L#else#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L#elif defined(LEFT) add KK, 1, L#else add KK, 4, L#endif and L, 3, L#endif cmp L, 0 ble,a,pn %icc, .LL79 nop.LL76: FADD c01, t1, c01 add AO, 1 * SIZE, AO FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 add L, -1, L FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 FADD c03, t3, c03 cmp L, 0 FMUL a1, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 add BO, 4 * SIZE, BO FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 bg,pt %icc, .LL76 LDF [BO + 3 * SIZE], b4.LL79:#ifndef TRMMKERNEL FADD c01, t1, c01 LDF [C1 + 0 * SIZE], a1 FADD c02, t2, c02 LDF [C2 + 0 * SIZE], a2 FADD c03, t3, c03 LDF [C3 + 0 * SIZE], a3 FADD c04, t4, c04 LDF [C4 + 0 * SIZE], a4 FMUL c01, ALPHA, c01 FMUL c02, ALPHA, c02 FMUL c03, ALPHA, c03 FMUL c04, ALPHA, c04 FADD c01, a1, c01 FADD c02, a2, c02 FADD c03, a3, c03 FADD c04, a4, c04 STF c01, [C1 + 0 * SIZE] STF c02, [C2 + 0 * SIZE] STF c03, [C3 + 0 * SIZE] STF c04, [C4 + 0 * SIZE]#else FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FMUL c01, ALPHA, c01 FMUL c02, ALPHA, c02 FMUL c03, ALPHA, c03 FMUL c04, ALPHA, c04 STF c01, [C1 + 0 * SIZE] STF c02, [C2 + 0 * SIZE] STF c03, [C3 + 0 * SIZE] STF c04, [C4 + 0 * SIZE]#if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1#ifdef LEFT add TEMP1, -1, TEMP1#else add TEMP1, -4, TEMP1#endif sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO#endif#ifdef LEFT add KK, 1, KK#endif#endif.LL99: add J, -1, J mov BO, B cmp J, 0 bg,pt %icc, .LL11#if defined(TRMMKERNEL) && !defined(LEFT) add KK, 4, KK#else nop#endif.LL100: /* n & 2 */ sra M, 2, I and N, 2, J cmp J, 0 add C, LDC, C2 ble,pn %icc, .LL200 mov A, AO#if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK#endif mov C, C1 add C2, LDC, C cmp I, 0 ble,pn %icc, .LL150 FMOV FZERO, c03.LL121:#if !defined(TRMMKERNEL) LDF [AO + 0 * SIZE], a1 sra K, 2, L
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -