📄 gemm_kernel.s
字号:
/*
 * Tail of a SPARC GEMM / TRMM micro-kernel.
 *
 * NOTE(review): the routine's entry point, its register aliases (AO, BO, B,
 * C, C1, C2, K, KK, L, I, J, M, N, OFFSET, TEMP1, TEMP2, a1-a4, b1-b4,
 * c01-c04, t1-t4, ALPHA, FZERO) and the LDF/STF/FADD/FMUL/FMOV macros are
 * defined outside this chunk -- confirm their meaning against the file head.
 *
 * Conventions visible in the code below:
 *   - Operand order is "op src1, src2, dst".
 *   - Products are software-pipelined: each step first folds the previous
 *     product (tN) into its accumulator (cNN), then issues the next FMUL
 *     into tN.  The leftover tN is folded in at the write-back label.
 *   - Every branch is followed by a delay-slot instruction.  "ble,a" is an
 *     annulled branch; here its delay slot is always a nop.
 *   - Instruction order is hand-scheduled; do not reorder.
 */

/* ---- last instructions of loop .LL172 (loop head is before this chunk) ---- */
	LDF	[BO + 10 * SIZE], b3
	FADD	c04, t4, c04
	FMUL	a4, b4, t4
	LDF	[BO + 11 * SIZE], b4
	add	BO, 8 * SIZE, BO
	bg,pt	%icc, .LL172
	LDF	[AO +  3 * SIZE], a4		/* delay slot */

/* ---- remainder count for the loop above: L = (depth) & 3 ---- */
.LL175:
#ifndef TRMMKERNEL
	and	K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 2, L
#endif
	and	L, 3, L
#endif
	cmp	L, 0
	ble,a,pn %icc, .LL179
	nop

/* One step per iteration: one A element times two B elements (AO += 1, BO += 2). */
.LL176:
	FADD	c01, t1, c01
	add	L, -1, L
	FMUL	a1, b1, t1
	add	AO, 1 * SIZE, AO

	LDF	[BO + 2 * SIZE], b1
	FADD	c02, t2, c02
	cmp	L, 0

	FMUL	a1, b2, t2
	LDF	[BO + 3 * SIZE], b2

	add	BO, 2 * SIZE, BO
	bg,pt	%icc, .LL176
	LDF	[AO + 0 * SIZE], a1		/* delay slot */

/*
 * Write-back of two results: c01 -> [C1], c02 -> [C2].
 * Partial sums c03/c04 are merged into c01/c02 before scaling by ALPHA.
 * The non-TRMM build adds the previous contents of C; the TRMM build
 * overwrites C and then advances AO/BO/KK.
 */
.LL179:
#ifndef TRMMKERNEL
	FADD	c01, t1, c01
	LDF	[C1 + 0 * SIZE], a1
	FADD	c02, t2, c02
	LDF	[C2 + 0 * SIZE], a2
	FADD	c03, t3, c03
	FADD	c04, t4, c04

	FADD	c01, c03, c01
	FADD	c02, c04, c02

	FMUL	c01, ALPHA, c01
	FMUL	c02, ALPHA, c02

	FADD	c01, a1, c01
	FADD	c02, a2, c02

	STF	c01, [C1 + 0 * SIZE]
	STF	c02, [C2 + 0 * SIZE]
#else
	FADD	c01, t1, c01
	FADD	c02, t2, c02
	FADD	c03, t3, c03
	FADD	c04, t4, c04

	FADD	c01, c03, c01
	FADD	c02, c04, c02

	FMUL	c01, ALPHA, c01
	FMUL	c02, ALPHA, c02

	STF	c01, [C1 + 0 * SIZE]
	STF	c02, [C2 + 0 * SIZE]

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	/* TEMP1 = K - KK - {1|2}; AO += TEMP1 elements, BO += 2 * TEMP1 elements */
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -1, TEMP1
#else
	add	TEMP1, -2, TEMP1
#endif
	sll	TEMP1, 0 + BASE_SHIFT, TEMP2
	sll	TEMP1, 1 + BASE_SHIFT, TEMP1
	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LEFT
	add	KK, 1, KK
#endif
#endif

/* ---- end of the preceding section: B takes the current BO ---- */
.LL199:
	mov	BO, B

#if defined(TRMMKERNEL) && !defined(LEFT)
	add	KK, 2, KK
#else
	nop
#endif

/*
 * ---- section executed when (N & 1) != 0 ----
 * J = N & 1, I = M >> 2.  Leaves through .LL999 when J <= 0.
 * All results in this section are stored through C1 only.
 */
.LL200:
	and	N, 1, J
	sra	M, 2, I
	cmp	J, 0
	ble,pn	%icc, .LL999
	mov	A, AO				/* delay slot */

#if defined(TRMMKERNEL) && defined(LEFT)
	mov	OFFSET, KK
#endif

	cmp	I, 0
	ble,pn	%icc, .LL250
	mov	C, C1				/* delay slot */

/*
 * Block producing four results (C1[0..3]); repeated I = M >> 2 times.
 * Set-up: clear c01-c04 / t1-t4, preload a1-a4 and b1-b4, L = depth >> 2.
 */
.LL221:
#ifndef TRMMKERNEL
	LDF	[AO + 0 * SIZE], a1
	sra	K, 2, L
	FMOV	FZERO, c01

	LDF	[B  + 0 * SIZE], b1
	mov	B, BO
	FMOV	FZERO, t1

	LDF	[AO + 1 * SIZE], a2
	cmp	L, 0
	FMOV	FZERO, c02

	LDF	[B  + 1 * SIZE], b2
	FMOV	FZERO, t2

	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, c03
	LDF	[B  + 2 * SIZE], b3
	FMOV	FZERO, t3

	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c04
	LDF	[B  + 3 * SIZE], b4
	FMOV	FZERO, t4
#else
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	B, BO
#else
	/* skip KK steps: AO += 4 * KK elements, BO = B + KK elements */
	sll	KK, 2 + BASE_SHIFT, TEMP1
	sll	KK, 0 + BASE_SHIFT, TEMP2

	add	AO, TEMP1, AO
	add	B,  TEMP2, BO
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 4, L
#else
	add	KK, 1, L
#endif
	sra	L, 2, L
	cmp	L, 0

	LDF	[AO + 0 * SIZE], a1
	FMOV	FZERO, c01
	LDF	[BO + 0 * SIZE], b1
	FMOV	FZERO, t1

	LDF	[AO + 1 * SIZE], a2
	FMOV	FZERO, c02
	LDF	[BO + 1 * SIZE], b2
	FMOV	FZERO, t2

	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, c03
	LDF	[BO + 2 * SIZE], b3
	FMOV	FZERO, t3

	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c04
	LDF	[BO + 3 * SIZE], b4
	FMOV	FZERO, t4
#endif

	/* FMOV does not appear to touch %icc, so this still tests "cmp L, 0" -- TODO confirm macro */
	ble,pn	%icc, .LL225
	prefetch [C1 + 4 * SIZE], 2		/* delay slot */

/* Main loop, four steps per iteration (AO += 16, BO += 4). */
.LL222:
	/* step 1: a1..a4 * b1 */
	FADD	c01, t1, c01
	add	BO, 4 * SIZE, BO
	FMUL	a1, b1, t1
	LDF	[AO +  4 * SIZE], a1

	FADD	c02, t2, c02
	FMUL	a2, b1, t2
	LDF	[AO +  5 * SIZE], a2

	FADD	c03, t3, c03
	add	L, -1, L
	FMUL	a3, b1, t3
	LDF	[AO +  6 * SIZE], a3

	FADD	c04, t4, c04
	FMUL	a4, b1, t4
	LDF	[AO +  7 * SIZE], a4
	LDF	[BO +  0 * SIZE], b1

	/* step 2: a1..a4 * b2 */
	FADD	c01, t1, c01
	cmp	L, 0
	FMUL	a1, b2, t1
	LDF	[AO +  8 * SIZE], a1

	FADD	c02, t2, c02
	FMUL	a2, b2, t2
	LDF	[AO +  9 * SIZE], a2

	FADD	c03, t3, c03
	FMUL	a3, b2, t3
	LDF	[AO + 10 * SIZE], a3

	FADD	c04, t4, c04
	FMUL	a4, b2, t4
	LDF	[AO + 11 * SIZE], a4
	LDF	[BO +  1 * SIZE], b2

	/* step 3: a1..a4 * b3 */
	FADD	c01, t1, c01
	FMUL	a1, b3, t1
	LDF	[AO + 12 * SIZE], a1

	FADD	c02, t2, c02
	FMUL	a2, b3, t2
	LDF	[AO + 13 * SIZE], a2

	FADD	c03, t3, c03
	FMUL	a3, b3, t3
	LDF	[AO + 14 * SIZE], a3

	FADD	c04, t4, c04
	FMUL	a4, b3, t4
	LDF	[AO + 15 * SIZE], a4
	LDF	[BO +  2 * SIZE], b3

	/* step 4: a1..a4 * b4 */
	FADD	c01, t1, c01
	FMUL	a1, b4, t1
	LDF	[AO + 16 * SIZE], a1

	FADD	c02, t2, c02
	FMUL	a2, b4, t2
	LDF	[AO + 17 * SIZE], a2

	FADD	c03, t3, c03
	FMUL	a3, b4, t3
	LDF	[AO + 18 * SIZE], a3

	FADD	c04, t4, c04
	FMUL	a4, b4, t4
	LDF	[AO + 19 * SIZE], a4

	add	AO, 16 * SIZE, AO
	bg,pt	%icc, .LL222
	LDF	[BO +  3 * SIZE], b4		/* delay slot */

/* Remainder count for the four-result block: L = (depth) & 3. */
.LL225:
#ifndef TRMMKERNEL
	and	K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 4, L
#else
	add	KK, 1, L
#endif
	and	L, 3, L
#endif
	cmp	L, 0
	ble,a,pn %icc, .LL229
	nop

/* One step per iteration (AO += 4, BO += 1). */
.LL226:
	FADD	c01, t1, c01
	add	BO, 1 * SIZE, BO
	FMUL	a1, b1, t1
	LDF	[AO + 4 * SIZE], a1

	FADD	c02, t2, c02
	add	L, -1, L
	FMUL	a2, b1, t2
	LDF	[AO + 5 * SIZE], a2

	FADD	c03, t3, c03
	cmp	L, 0
	FMUL	a3, b1, t3
	LDF	[AO + 6 * SIZE], a3

	FADD	c04, t4, c04
	FMUL	a4, b1, t4
	LDF	[AO + 7 * SIZE], a4
	add	AO, 4 * SIZE, AO

	bg,pt	%icc, .LL226
	LDF	[BO + 0 * SIZE], b1		/* delay slot */

/*
 * Write-back of four results to C1[0..3], then C1 += 4 and I -= 1.
 * Both build variants leave %icc set from "cmp I, 0" for the loop branch
 * that follows the #endif.
 */
.LL229:
#ifndef TRMMKERNEL
	FADD	c01, t1, c01
	add	I, -1, I
	FADD	c02, t2, c02
	cmp	I, 0
	FADD	c03, t3, c03
	FADD	c04, t4, c04

	FMUL	c01, ALPHA, c01
	FMUL	c02, ALPHA, c02
	FMUL	c03, ALPHA, c03
	FMUL	c04, ALPHA, c04

	LDF	[C1 + 0 * SIZE], a1
	LDF	[C1 + 1 * SIZE], a2
	LDF	[C1 + 2 * SIZE], a3
	LDF	[C1 + 3 * SIZE], a4

	FADD	c01, a1, c01
	FADD	c02, a2, c02
	FADD	c03, a3, c03
	FADD	c04, a4, c04

	STF	c01, [C1 + 0 * SIZE]
	STF	c02, [C1 + 1 * SIZE]
	STF	c03, [C1 + 2 * SIZE]
	STF	c04, [C1 + 3 * SIZE]

	add	C1, 4 * SIZE, C1
#else
	FADD	c01, t1, c01
	FADD	c02, t2, c02
	FADD	c03, t3, c03
	FADD	c04, t4, c04

	FMUL	c01, ALPHA, c01
	FMUL	c02, ALPHA, c02
	FMUL	c03, ALPHA, c03
	FMUL	c04, ALPHA, c04

	STF	c01, [C1 + 0 * SIZE]
	STF	c02, [C1 + 1 * SIZE]
	STF	c03, [C1 + 2 * SIZE]
	STF	c04, [C1 + 3 * SIZE]

	add	C1, 4 * SIZE, C1

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	/* TEMP1 = K - KK - {4|1}; AO += 4 * TEMP1 elements, BO += TEMP1 elements */
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -4, TEMP1
#else
	add	TEMP1, -1, TEMP1
#endif
	sll	TEMP1, 2 + BASE_SHIFT, TEMP2
	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LEFT
	add	KK, 4, KK
#endif

	add	I, -1, I
	cmp	I, 0
#endif
	bg,pt	%icc, .LL221
	nop

/* ---- block producing two results (C1[0..1]); runs when (M & 2) != 0 ---- */
.LL250:
	and	M, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL270
	nop

/* Set-up: clear c01-c04 / t1-t4, preload a1-a4 and b1-b4, L = depth >> 2. */
.LL251:
#ifndef TRMMKERNEL
	LDF	[AO + 0 * SIZE], a1
	sra	K, 2, L
	FMOV	FZERO, c01

	LDF	[B  + 0 * SIZE], b1
	mov	B, BO
	FMOV	FZERO, t1

	LDF	[AO + 1 * SIZE], a2
	cmp	L, 0
	FMOV	FZERO, c02

	LDF	[B  + 1 * SIZE], b2
	FMOV	FZERO, t2

	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, c03
	LDF	[B  + 2 * SIZE], b3
	FMOV	FZERO, t3

	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c04
	LDF	[B  + 3 * SIZE], b4
	FMOV	FZERO, t4
#else
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	B, BO
#else
	/* skip KK steps: AO += 2 * KK elements, BO = B + KK elements */
	sll	KK, 1 + BASE_SHIFT, TEMP1
	sll	KK, 0 + BASE_SHIFT, TEMP2

	add	AO, TEMP1, AO
	add	B,  TEMP2, BO
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 2, L
#else
	add	KK, 1, L
#endif
	sra	L, 2, L
	cmp	L, 0

	LDF	[AO + 0 * SIZE], a1
	FMOV	FZERO, c01
	LDF	[BO + 0 * SIZE], b1
	FMOV	FZERO, t1

	LDF	[AO + 1 * SIZE], a2
	FMOV	FZERO, c02
	LDF	[BO + 1 * SIZE], b2
	FMOV	FZERO, t2

	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, c03
	LDF	[BO + 2 * SIZE], b3
	FMOV	FZERO, t3

	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c04
	LDF	[BO + 3 * SIZE], b4
	FMOV	FZERO, t4
#endif

	ble,pn	%icc, .LL255
	nop

/*
 * Main loop, four steps per iteration (AO += 8, BO += 4).
 * Steps alternate between the accumulator pairs (c01,c02) and (c03,c04);
 * the pairs are summed at .LL259.
 */
.LL252:
	FADD	c01, t1, c01
	add	L, -1, L
	FMUL	a1, b1, t1
	LDF	[AO +  4 * SIZE], a1

	FADD	c02, t2, c02
	FMUL	a2, b1, t2
	LDF	[AO +  5 * SIZE], a2
	LDF	[BO +  4 * SIZE], b1

	FADD	c03, t3, c03
	cmp	L, 0
	FMUL	a3, b2, t3
	LDF	[AO +  6 * SIZE], a3

	FADD	c04, t4, c04
	FMUL	a4, b2, t4
	LDF	[AO +  7 * SIZE], a4
	LDF	[BO +  5 * SIZE], b2

	FADD	c01, t1, c01
	FMUL	a1, b3, t1
	LDF	[AO +  8 * SIZE], a1

	FADD	c02, t2, c02
	FMUL	a2, b3, t2
	LDF	[AO +  9 * SIZE], a2
	LDF	[BO +  6 * SIZE], b3

	FADD	c03, t3, c03
	FMUL	a3, b4, t3
	LDF	[AO + 10 * SIZE], a3

	FADD	c04, t4, c04
	FMUL	a4, b4, t4
	LDF	[AO + 11 * SIZE], a4
	add	AO, 8 * SIZE, AO
	LDF	[BO +  7 * SIZE], b4

	bg,pt	%icc, .LL252
	add	BO, 4 * SIZE, BO		/* delay slot */

/* Remainder count for the two-result block: L = (depth) & 3. */
.LL255:
#ifndef TRMMKERNEL
	and	K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 2, L
#else
	add	KK, 1, L
#endif
	and	L, 3, L
#endif
	cmp	L, 0
	ble,a,pn %icc, .LL259
	nop

/* One step per iteration (AO += 2, BO += 1). */
.LL256:
	FADD	c01, t1, c01
	add	L, -1, L
	FMUL	a1, b1, t1
	LDF	[AO + 2 * SIZE], a1

	FADD	c02, t2, c02
	cmp	L, 0
	FMUL	a2, b1, t2
	LDF	[AO + 3 * SIZE], a2

	LDF	[BO + 1 * SIZE], b1
	add	AO, 2 * SIZE, AO

	bg,pt	%icc, .LL256
	add	BO, 1 * SIZE, BO		/* delay slot */

/* Write-back of two results to C1[0..1], then C1 += 2. */
.LL259:
#ifndef TRMMKERNEL
	FADD	c01, t1, c01
	LDF	[C1 + 0 * SIZE], a1
	FADD	c02, t2, c02
	LDF	[C1 + 1 * SIZE], a2
	FADD	c03, t3, c03
	FADD	c04, t4, c04

	FADD	c01, c03, c01
	FADD	c02, c04, c02

	FMUL	c01, ALPHA, c01
	FMUL	c02, ALPHA, c02

	FADD	c01, a1, c01
	FADD	c02, a2, c02

	STF	c01, [C1 + 0 * SIZE]
	STF	c02, [C1 + 1 * SIZE]

	add	C1, 2 * SIZE, C1
#else
	FADD	c01, t1, c01
	FADD	c02, t2, c02
	FADD	c03, t3, c03
	FADD	c04, t4, c04

	FADD	c01, c03, c01
	FADD	c02, c04, c02

	FMUL	c01, ALPHA, c01
	FMUL	c02, ALPHA, c02

	STF	c01, [C1 + 0 * SIZE]
	STF	c02, [C1 + 1 * SIZE]

	add	C1, 2 * SIZE, C1

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	/* TEMP1 = K - KK - {2|1}; AO += 2 * TEMP1 elements, BO += TEMP1 elements */
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -2, TEMP1
#else
	add	TEMP1, -1, TEMP1
#endif
	sll	TEMP1, 1 + BASE_SHIFT, TEMP2
	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LEFT
	add	KK, 2, KK
#endif
#endif

/* ---- block producing one result (C1[0]); runs when (M & 1) != 0 ---- */
.LL270:
	and	M, 1, I
	cmp	I, 0
	ble,pn	%icc, .LL999
	nop

/*
 * Set-up: clear c01, c02 and t1-t4 (c03/c04 are not used by this block),
 * preload a1-a4 and b1-b3; b4 is loaded in the delay slot of the branch
 * that follows the #endif.  L = depth >> 2.
 */
.LL271:
#ifndef TRMMKERNEL
	LDF	[AO + 0 * SIZE], a1
	sra	K, 2, L
	FMOV	FZERO, t1

	LDF	[AO + 1 * SIZE], a2
	mov	B, BO
	FMOV	FZERO, c01

	LDF	[AO + 2 * SIZE], a3
	cmp	L, 0
	FMOV	FZERO, t2

	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c02

	LDF	[BO + 0 * SIZE], b1
	FMOV	FZERO, t3
	LDF	[BO + 1 * SIZE], b2
	FMOV	FZERO, t4
	LDF	[BO + 2 * SIZE], b3
#else
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	B, BO
#else
	/* skip KK steps: AO += KK elements, BO = B + KK elements */
	sll	KK, 0 + BASE_SHIFT, TEMP1
	sll	KK, 0 + BASE_SHIFT, TEMP2

	add	AO, TEMP1, AO
	add	B,  TEMP2, BO
#endif

	/* the LEFT and non-LEFT cases both use KK + 1 here, so they share one arm */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#else
	add	KK, 1, L
#endif
	sra	L, 2, L
	cmp	L, 0

	LDF	[AO + 0 * SIZE], a1
	FMOV	FZERO, t1
	LDF	[AO + 1 * SIZE], a2
	FMOV	FZERO, c01

	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, t2
	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c02

	LDF	[BO + 0 * SIZE], b1
	FMOV	FZERO, t3
	LDF	[BO + 1 * SIZE], b2
	FMOV	FZERO, t4
	LDF	[BO + 2 * SIZE], b3
#endif

	ble,pn	%icc, .LL275
	LDF	[BO + 3 * SIZE], b4		/* delay slot */

/*
 * Main loop, four steps per iteration (AO += 4, BO += 4).
 * t1/t3 are folded into c01 and t2/t4 into c02.
 */
.LL272:
	FADD	c01, t1, c01
	add	L, -1, L
	add	AO, 4 * SIZE, AO

	FMUL	a1, b1, t1
	add	BO, 4 * SIZE, BO
	LDF	[AO + 0 * SIZE], a1

	FADD	c02, t2, c02
	cmp	L, 0
	LDF	[BO + 0 * SIZE], b1

	FMUL	a2, b2, t2
	LDF	[AO + 1 * SIZE], a2

	FADD	c01, t3, c01
	LDF	[BO + 1 * SIZE], b2

	FMUL	a3, b3, t3
	LDF	[AO + 2 * SIZE], a3

	FADD	c02, t4, c02
	LDF	[BO + 2 * SIZE], b3

	FMUL	a4, b4, t4
	LDF	[AO + 3 * SIZE], a4

	bg,pt	%icc, .LL272
	LDF	[BO + 3 * SIZE], b4		/* delay slot */

/* Remainder count for the one-result block: L = (depth) & 3. */
.LL275:
#ifndef TRMMKERNEL
	and	K, 3, L
#else
	/* the LEFT and non-LEFT cases both use KK + 1 here, so they share one arm */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#else
	add	KK, 1, L
#endif
	and	L, 3, L
#endif
	cmp	L, 0
	ble,a,pn %icc, .LL279
	nop

/* One step per iteration (AO += 1, BO += 1). */
.LL276:
	FADD	c01, t1, c01
	add	L, -1, L
	FMUL	a1, b1, t1

	LDF	[AO + 1 * SIZE], a1
	LDF	[BO + 1 * SIZE], b1

	add	BO, 1 * SIZE, BO
	cmp	L, 0
	bg,pt	%icc, .LL276
	add	AO, 1 * SIZE, AO		/* delay slot */

/* Write-back of one result to C1[0]; c02 is merged into c01 before scaling. */
.LL279:
#ifndef TRMMKERNEL
	FADD	c01, t1, c01
	LDF	[C1 + 0 * SIZE], a1
	FADD	c02, t2, c02
	FADD	c01, t3, c01
	FADD	c02, t4, c02

	FADD	c01, c02, c01

	FMUL	c01, ALPHA, c01
	FADD	c01, a1, c01

	STF	c01, [C1 + 0 * SIZE]
#else
	FADD	c01, t1, c01
	FADD	c02, t2, c02
	FADD	c01, t3, c01
	FADD	c02, t4, c02

	FADD	c01, c02, c01

	FMUL	c01, ALPHA, c01

	STF	c01, [C1 + 0 * SIZE]

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	/* TEMP1 = K - KK - 1 (same for LEFT and non-LEFT); AO and BO += TEMP1 elements */
	sub	K, KK, TEMP1
	add	TEMP1, -1, TEMP1
	sll	TEMP1, 0 + BASE_SHIFT, TEMP2
	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LEFT
	add	KK, 1, KK
#endif
#endif

/* ---- common exit ---- */
.LL999:
	return	%i7 + 8
	clr	%o0				/* delay slot: clears %o0 -- presumably the return value seen by the caller (status 0); confirm */

	EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -