📄 trsm_kernel_rt.s
字号:
FMUL a1, b1, t1 add BO, 16 * SIZE, BO FADD c04, t2, c04 add L, -1, L FMUL a1, b2, t2 FADD c06, t3, c06 cmp L, 0 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO - 4 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 12 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 11 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 10 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 9 * SIZE], b4 FADD c02, t1, c02 FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD c04, t2, c04 FMUL a3, b2, t2 FADD c06, t3, c06 FMUL a3, b3, t3 FADD c08, t4, c08 FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD c03, t2, c03 FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD c05, t3, c05 FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD c07, t4, c07 FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD c02, t1, c02 FMUL a1, b1, t1 LDF [AO - 1 * SIZE], a4 FADD c04, t2, c04 FMUL a1, b2, t2 FADD c06, t3, c06 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 4 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 3 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 1 * SIZE], b4 FADD c02, t1, c02 FMUL a3, b1, t1 LDF [AO + 1 * SIZE], a2 FADD c04, t2, c04 FMUL a3, b2, t2 FADD c06, t3, c06 FMUL a3, b3, t3 FADD c08, t4, c08 FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c03, t2, c03 FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c05, t3, c05 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c07, t4, c07 FMUL a4, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL52 LDF [AO + 3 * SIZE], a4.LL55:#if defined(LT) || defined(RN) and KK, 3, L#else and TEMP1, 3, L#endif cmp L, 0 ble,a,pn %icc, .LL59 nop.LL56: FADD c02, t1, c02 add AO, 2 * SIZE, AO FMUL a1, b1, t1 add L, -1, L add BO, 4 * SIZE, BO FADD c04, t2, c04 cmp L, 0 FMUL a1, b2, t2 FADD c06, t3, c06 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL56 LDF [AO + 1 * SIZE], a2.LL59:#if defined(LN) || defined(RT)#ifdef LN sub KK, 2, TEMP1#else sub KK, 4, TEMP1#endif sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO#endif FADD c02, t1, c02 FADD c04, t2, c04 FADD c06, t3, c06 FADD c08, t4, c08#if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 FSUB b1, c02, c02 FSUB b2, c04, c04 FSUB b3, c06, c06 FSUB b4, c08, c08#else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08#endif#ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FMUL a1, c06, c06 FMUL a1, c08, c08 FMUL a2, c02, t1 FMUL a2, c04, t2 FMUL a2, c06, t3 FMUL a2, c08, t4 FSUB c01, t1, c01 FSUB c03, t2, c03 FSUB c05, t3, c05 FSUB c07, t4, c07 FMUL a3, c01, c01 FMUL a3, c03, c03 FMUL a3, c05, c05 FMUL a3, c07, c07#endif#ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a1, c05, c05 FMUL a1, c07, c07 FMUL a2, c01, t1 FMUL a2, c03, t2 FMUL a2, c05, t3 FMUL a2, c07, t4 FSUB c02, t1, c02 FSUB c04, t2, c04 FSUB c06, t3, c06 FSUB c08, t4, c08 FMUL a3, c02, c02 FMUL a3, c04, c04 FMUL a3, c06, c06 FMUL a3, c08, c08#endif#ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a2, c01, t1 FMUL a2, c02, t2 FSUB c03, t1, c03 FSUB c04, t2, c04 FMUL a3, c01, t1 FMUL a3, c02, t2 FSUB c05, t1, c05 FSUB c06, t2, c06 FMUL a4, c01, t1 FMUL a4, c02, t2 FSUB c07, t1, c07 FSUB c08, t2, c08 LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c03, t1 FMUL a2, c04, t2 FSUB c05, t1, c05 FSUB c06, t2, c06 FMUL a3, c03, t1 FMUL a3, c04, t2 FSUB c07, t1, c07 FSUB c08, t2, c08 LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a2, c05, t1 FMUL a2, c06, t2 FSUB c07, t1, c07 FSUB c08, t2, c08 LDF [BO + 15 * SIZE], a1 FMUL a1, c07, c07 FMUL a1, c08, c08#endif#ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c07, c07 FMUL a1, c08, c08 FMUL a2, c07, t1 FMUL a2, c08, t2 FSUB c05, t1, c05 FSUB c06, t2, c06 FMUL a3, c07, t1 FMUL a3, c08, t2 FSUB c03, t1, c03 FSUB c04, t2, c04 FMUL a4, c07, t1 FMUL a4, c08, t2 FSUB c01, t1, c01 FSUB c02, t2, c02 LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a2, c05, t1 FMUL a2, c06, t2 FSUB c03, t1, c03 FSUB c04, t2, c04 FMUL a3, c05, t1 FMUL a3, c06, t2 FSUB c01, t1, c01 FSUB c02, t2, c02 LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c03, t1 FMUL a2, c04, t2 FSUB c01, t1, c01 FSUB c02, t2, c02 LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02#endif#ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 add C3, -2 * SIZE, C3 add C4, -2 * SIZE, C4#endif#if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c07, [BO + 3 * SIZE] STF c02, [BO + 4 * SIZE] STF c04, [BO + 5 * SIZE] STF c06, [BO + 6 * SIZE] STF c08, [BO + 7 * SIZE]#else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE]#endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] STF c05, [C3 + 0 * SIZE] STF c06, [C3 + 1 * SIZE] STF c07, [C4 + 0 * SIZE] STF c08, [C4 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4#ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 add C3, 2 * SIZE, C3 add C4, 2 * SIZE, C4#endif#ifdef RT sll K, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG#endif#if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO#endif#ifdef LT add KK, 2, KK#endif#ifdef LN sub KK, 2, KK#endif.LL70: and M, 1, I cmp I, 0 ble,pn %icc, .LL99 nop#if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0#else#ifdef LN sll K, 0 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG#endif sll KK, 0 + BASE_SHIFT, TEMP1 sll KK, 2 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0#endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL75 nop.LL72: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 cmp L, 0 FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 FADD c03, t3, c03 FMUL a1, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 FMUL a1, b4, t4 LDF [BO + 7 * SIZE], b4 LDF [AO + 4 * SIZE], a1 FADD c01, t1, c01 add AO, 4 * SIZE, AO FMUL a2, b1, t1 LDF [BO + 8 * SIZE], b1 FADD c02, t2, c02 FMUL a2, b2, t2 LDF [BO + 9 * SIZE], b2 FADD c03, t3, c03 FMUL a2, b3, t3 LDF [BO + 10 * SIZE], b3 FADD c04, t4, c04 FMUL a2, b4, t4 LDF [BO + 11 * SIZE], b4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b1, t1 LDF [BO + 12 * SIZE], b1 FADD c02, t2, c02 FMUL a3, b2, t2 LDF [BO + 13 * SIZE], b2 FADD c03, t3, c03 FMUL a3, b3, t3 LDF [BO + 14 * SIZE], b3 FADD c04, t4, c04 FMUL a3, b4, t4 LDF [BO + 15 * SIZE], b4 LDF [AO + 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO + 16 * SIZE], b1 FADD c02, t2, c02 FMUL a4, b2, t2 LDF [BO + 17 * SIZE], b2 FADD c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 18 * SIZE], b3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [BO + 19 * SIZE], b4 add BO, 16 * SIZE, BO bg,pt %icc, .LL72 LDF [AO + 3 * SIZE], a4.LL75:#if defined(LT) || defined(RN) and KK, 3, L#else and TEMP1, 3, L#endif cmp L, 0 ble,a,pn %icc, .LL79 nop.LL76: FADD c01, t1, c01 add AO, 1 * SIZE, AO FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 add L, -1, L FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 FADD c03, t3, c03 cmp L, 0 FMUL a1, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 add BO, 4 * SIZE, BO FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 bg,pt %icc, .LL76 LDF [BO + 3 * SIZE], b4.LL79: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04#if defined(LN) || defined(RT)#ifdef LN sub KK, 1, TEMP1#else sub KK, 4, TEMP1#endif sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO#endif#if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04#else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04#endif#ifdef LN LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04#endif#ifdef LT LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04#endif#ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a2, c01, t1 FSUB c02, t1, c02 FMUL a3, c01, t1 FSUB c03, t1, c03 FMUL a4, c01, t1 FSUB c04, t1, c04 LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c03, t1, c03 FMUL a3, c02, t1 FSUB c04, t1, c04 LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c03, c03 FMUL a2, c03, t1 FSUB c04, t1, c04 LDF [BO + 15 * SIZE], a1 FMUL a1, c04, c04#endif#ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c04, c04 FMUL a2, c04, t1 FSUB c03, t1, c03 FMUL a3, c04, t1 FSUB c02, t1, c02 FMUL a4, c04, t1 FSUB c01, t1, c01 LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c03, c03 FMUL a2, c03, t1 FSUB c02, t1, c02 FMUL a3, c03, t1 FSUB c01, t1, c01 LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c01, t1, c01 LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01#endif#ifdef LN add C1, -1 * SIZE, C1 add C2, -1 * SIZE, C2 add C3, -1 * SIZE, C3 add C4, -1 * SIZE, C4#endif#if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c03, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE]#else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE]#endif STF c01, [C1 + 0 * SIZE] STF c02, [C2 + 0 * SIZE] STF c03, [C3 + 0 * SIZE] STF c04, [C4 + 0 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4#ifndef LN add C1, 1 * SIZE, C1 add C2, 1 * SIZE, C2 add C3, 1 * SIZE, C3 add C4, 1 * SIZE, C4#endif#ifdef RT sll K, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG#endif#if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO#endif#ifdef LT add KK, 1, KK#endif#ifdef LN sub KK, 1, KK#endif.LL99:#ifdef LN sll K, 2 + BASE_SHIFT, TEMP1 add B, TEMP1, B#endif#if defined(LT) || defined(RN) mov BO, B#endif#ifdef RN add KK, 4, KK#endif#ifdef RT sub KK, 4, KK#endif add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop.LL999: return %i7 + 8 clr %o0 EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -