📄 gemm_kernel.s
字号:
FMOV FZERO, t1 LDF [B + 0 * SIZE], b1 mov B, BO FMOV FZERO, c07 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, t2 LDF [B + 1 * SIZE], b2 FMOV FZERO, c04 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t3 LDF [B + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, t4 LDF [B + 3 * SIZE], b4 FMOV FZERO, c01 prefetch [C1 + 3 * SIZE], 2 FMOV FZERO, c05 prefetch [C2 + 3 * SIZE], 2 FMOV FZERO, c02#else#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO#else sll KK, 2 + BASE_SHIFT, TEMP1 sll KK, 1 + BASE_SHIFT, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L#elif defined(LEFT) add KK, 4, L#else add KK, 2, L#endif sra L, 2, L cmp L, 0 LDF [AO + 0 * SIZE], a1 FMOV FZERO, t1 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c07 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t2 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c04 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t3 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, t4 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c01 prefetch [C1 + 3 * SIZE], 2 FMOV FZERO, c05 prefetch [C2 + 3 * SIZE], 2 FMOV FZERO, c02#endif ble,pn %icc, .LL125 FMOV FZERO, c06.LL122: FADD c03, t1, c03 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD c07, t2, c07 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD c04, t3, c04 add AO, 16 * SIZE, AO FMUL a2, b1, t3 cmp L, 0 FADD c08, t4, c08 nop FMUL a2, b2, t4 LDF [AO - 11 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b1, t1 nop FADD c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 10 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO - 4 * SIZE], b1 FADD c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO - 3 * SIZE], b2 FADD c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 9 * SIZE], a4 FADD c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO - 8 * SIZE], a1 FADD c04, t3, c04 nop FMUL a2, b3, t3 nop FADD c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b3, t1 nop FADD c05, t2, c05 nop FMUL a3, b4, t2 LDF [AO - 6 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c06, t4, c06 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD c03, t1, c03 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD c07, t2, c07 nop FMUL a1, b2, t2 LDF [AO - 4 * SIZE], a1 FADD c04, t3, c04 nop FMUL a2, b1, t3 nop FADD c08, t4, c08 nop FMUL a2, b2, t4 LDF [AO - 3 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b1, t1 nop FADD c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 2 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 FADD c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 1 * SIZE], a4 FADD c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO + 0 * SIZE], a1 FADD c04, t3, c04 nop FMUL a2, b3, t3 nop FADD c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b3, t1 nop FADD c05, t2, c05 nop FMUL a3, b4, t2 LDF [AO + 2 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c06, t4, c06 FMUL a4, b4, t4 LDF [AO + 3 * SIZE], a4 bg,pt %icc, .LL122 LDF [BO + 3 * SIZE], b4.LL125:#ifndef TRMMKERNEL and K, 3, L#else#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L#elif defined(LEFT) add KK, 4, L#else add KK, 2, L#endif and L, 3, L#endif cmp L, 0 ble,a,pn %icc, .LL129 nop.LL126: FADD c03, t1, c03 add AO, 4 * SIZE, AO FMUL a1, b1, t1 add BO, 2 * SIZE, BO FADD c07, t2, c07 add L, -1, L FMUL a1, b2, t2 LDF [AO + 0 * SIZE], a1 FADD c04, t3, c04 cmp L, 0 FMUL a2, b1, t3 FADD c08, t4, c08 FMUL a2, b2, t4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b1, t1 FADD c05, t2, c05 FMUL a3, b2, t2 LDF [AO + 2 * SIZE], a3 FADD c02, t3, c02 FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD c06, t4, c06 FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 bg,pt %icc, .LL126 LDF [AO + 3 * SIZE], a4.LL129:#ifndef TRMMKERNEL FADD c03, t1, c03 add I, -1, I LDF [C1 + 0 * SIZE], a1 FADD c07, t2, c07 cmp I, 0 LDF [C1 + 1 * SIZE], a2 FADD c04, t3, c04 LDF [C1 + 2 * SIZE], a3 FADD c08, t4, c08 LDF [C1 + 3 * SIZE], a4 LDF [C2 + 0 * SIZE], b1 FMUL c01, ALPHA, c01 LDF [C2 + 1 * SIZE], b2 FMUL c02, ALPHA, c02 LDF [C2 + 2 * SIZE], b3 FMUL c03, ALPHA, c03 LDF [C2 + 3 * SIZE], b4 FMUL c04, ALPHA, c04 FMUL c05, ALPHA, c05 FADD c01, a1, c01 FMUL c06, ALPHA, c06 FADD c02, a2, c02 FMUL c07, ALPHA, c07 FADD c03, a3, c03 FMUL c08, ALPHA, c08 FADD c04, a4, c04 STF c01, [C1 + 0 * SIZE] FADD c05, b1, c05 STF c02, [C1 + 1 * SIZE] FADD c06, b2, c06 STF c03, [C1 + 2 * SIZE] FADD c07, b3, c07 STF c04, [C1 + 3 * SIZE] add C1, 4 * SIZE, C1 FADD c08, b4, c08 STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] STF c07, [C2 + 2 * SIZE] STF c08, [C2 + 3 * SIZE] add C2, 4 * SIZE, C2#else FADD c03, t1, c03 FADD c07, t2, c07 FADD c04, t3, c04 FADD c08, t4, c08 FMUL c01, ALPHA, c01 FMUL c02, ALPHA, c02 FMUL c03, ALPHA, c03 FMUL c04, ALPHA, c04 FMUL c05, ALPHA, c05 FMUL c06, ALPHA, c06 FMUL c07, ALPHA, c07 FMUL c08, ALPHA, c08 STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] STF c07, [C2 + 2 * SIZE] STF c08, [C2 + 3 * SIZE] add C1, 4 * SIZE, C1 add C2, 4 * SIZE, C2#if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1#ifdef LEFT add TEMP1, -4, TEMP1#else add TEMP1, -2, TEMP1#endif sll TEMP1, 2 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO#endif#ifdef LEFT add KK, 4, KK#endif add I, -1, I cmp I, 0#endif bg,pt %icc, .LL121 FMOV FZERO, c03.LL150: and M, 2, I cmp I, 0 ble,pn %icc, .LL170 nop.LL151:#if !defined(TRMMKERNEL) LDF [AO + 0 * SIZE], a1 sra K, 2, L FMOV FZERO, c01 LDF [B + 0 * SIZE], b1 mov B, BO FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, c02 LDF [B + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [B + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [B + 3 * SIZE], b4 FMOV FZERO, t4#else#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO#else sll KK, 1 + BASE_SHIFT, TEMP1 sll KK, 1 + BASE_SHIFT, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L#elif defined(LEFT) add KK, 2, L#else add KK, 2, L#endif sra L, 2, L cmp L, 0 LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4#endif ble,pn %icc, .LL155 nop.LL152: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD c02, t2, c02 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD c03, t3, c03 cmp L, 0 FMUL a2, b1, t3 LDF [BO - 4 * SIZE], b1 FADD c04, t4, c04 nop FMUL a2, b2, t4 LDF [AO + 5 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b3, t1 LDF [BO - 3 * SIZE], b2 FADD c02, t2, c02 nop FMUL a3, b4, t2 LDF [AO + 6 * SIZE], a3 FADD c03, t3, c03 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c04, t4, c04 nop FMUL a4, b4, t4 LDF [AO + 7 * SIZE], a4 FADD c01, t1, c01 nop FMUL a1, b1, t1 LDF [BO - 1 * SIZE], b4 FADD c02, t2, c02 FMUL a1, b2, t2 LDF [AO + 8 * SIZE], a1 FADD c03, t3, c03 FMUL a2, b1, t3 LDF [BO + 0 * SIZE], b1 FADD c04, t4, c04 FMUL a2, b2, t4 LDF [AO + 9 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b3, t1 LDF [BO + 1 * SIZE], b2 FADD c02, t2, c02 FMUL a3, b4, t2 LDF [AO + 10 * SIZE], a3 FADD c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO bg,pt %icc, .LL152 LDF [BO + 3 * SIZE], b4.LL155:#ifndef TRMMKERNEL and K, 3, L#else#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L#elif defined(LEFT) add KK, 2, L#else add KK, 2, L#endif and L, 3, L#endif cmp L, 0 ble,a,pn %icc, .LL159 nop.LL156: LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FMUL a1, b1, t1 FMUL a1, b2, t2 FMUL a2, b1, t3 FMUL a2, b2, t4 add AO, 2 * SIZE, AO add BO, 2 * SIZE, BO add L, -1, L cmp L, 0 bg,pt %icc, .LL156 nop.LL159:#ifndef TRMMKERNEL LDF [C1 + 0 * SIZE], a1 LDF [C2 + 0 * SIZE], a2 LDF [C1 + 1 * SIZE], a3 LDF [C2 + 1 * SIZE], a4 FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FMUL c01, ALPHA, c01 FMUL c02, ALPHA, c02 FMUL c03, ALPHA, c03 FMUL c04, ALPHA, c04 FADD c01, a1, c01 FADD c02, a2, c02 FADD c03, a3, c03 FADD c04, a4, c04 STF c01, [C1 + 0 * SIZE] STF c02, [C2 + 0 * SIZE] STF c03, [C1 + 1 * SIZE] add C1, 2 * SIZE, C1 STF c04, [C2 + 1 * SIZE] add C2, 2 * SIZE, C2#else FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FMUL c01, ALPHA, c01 FMUL c02, ALPHA, c02 FMUL c03, ALPHA, c03 FMUL c04, ALPHA, c04 STF c01, [C1 + 0 * SIZE] STF c02, [C2 + 0 * SIZE] STF c03, [C1 + 1 * SIZE] STF c04, [C2 + 1 * SIZE] add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2#if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1#ifdef LEFT add TEMP1, -2, TEMP1#else add TEMP1, -2, TEMP1#endif sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO#endif#ifdef LEFT add KK, 2, KK#endif#endif.LL170: and M, 1, I cmp I, 0 ble,pn %icc, .LL199 nop.LL171:#if !defined(TRMMKERNEL) LDF [AO + 0 * SIZE], a1 sra K, 2, L FMOV FZERO, c01 LDF [B + 0 * SIZE], b1 mov B, BO FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, c02 LDF [B + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [B + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [B + 3 * SIZE], b4 FMOV FZERO, t4#else#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO#else sll KK, 0 + BASE_SHIFT, TEMP1 sll KK, 1 + BASE_SHIFT, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L#elif defined(LEFT) add KK, 1, L#else add KK, 2, L#endif sra L, 2, L cmp L, 0 LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4#endif ble,pn %icc, .LL175 nop.LL172: FADD c01, t1, c01 add AO, 4 * SIZE, AO FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 add L, -1, L LDF [AO + 0 * SIZE], a1 FADD c03, t3, c03 cmp L, 0 FMUL a2, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 FMUL a2, b4, t4 LDF [BO + 7 * SIZE], b4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b1, t1 LDF [BO + 8 * SIZE], b1 FADD c02, t2, c02 FMUL a3, b2, t2 LDF [BO + 9 * SIZE], b2 LDF [AO + 2 * SIZE], a3 FADD c03, t3, c03 FMUL a4, b3, t3
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -