📄 trsm_kernel_rt.s
字号:
prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY#else nop#endif FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 0 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 3 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 1 * SIZE], a4 FADD c08, t2, c08 FMUL a5, b2, t2 FADD c12, t3, c12 FMUL a5, b3, t3 FADD c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO + 4 * SIZE], a5 FADD c01, t1, c01 nop FMUL a2, b5, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b5, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c03, t1, c03 cmp L, 0 FMUL a4, b5, t1 LDF [BO + 4 * SIZE], b5 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL22 LDF [BO + 3 * SIZE], b4.LL25:#if defined(LT) || defined(RN) and KK, 3, L#else and TEMP1, 3, L#endif cmp L, 0 ble,a,pn %icc, .LL29 nop.LL26: FADD c04, t1, c04 LDF [AO + 3 * SIZE], a4 FMUL a1, b1, t1 add AO, 4 * SIZE, AO FADD c08, t2, c08 add BO, 4 * SIZE, BO FMUL a1, b2, t2 add L, -1, L FADD c12, t3, c12 nop FMUL a1, b3, t3 cmp L, 0 FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD c01, t1, c01 nop FMUL a2, b1, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b1, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL26 LDF [BO + 3 * SIZE], b4.LL29:#if defined(LN) || defined(RT) sub KK, 4, TEMP1 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO#endif FADD c04, t1, c04 FADD c08, t2, c08 FADD c12, t3, c12 FADD c16, t4, c16#if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c05, c05 FSUB a3, c09, c09 FSUB a4, c13, c13 FSUB b1, c02, c02 FSUB b2, c06, c06 FSUB b3, c10, c10 FSUB b4, c14, c14 LDF [BO + 8 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 10 * SIZE], a3 LDF [BO + 11 * SIZE], a4 LDF [BO + 12 * SIZE], b1 LDF [BO + 13 * SIZE], b2 LDF [BO + 14 * SIZE], b3 LDF [BO + 15 * SIZE], b4 FSUB a1, c03, c03 FSUB a2, c07, c07 FSUB a3, c11, c11 FSUB a4, c15, c15 FSUB b1, c04, c04 FSUB b2, c08, c08 FSUB b3, c12, c12 FSUB b4, c16, c16#else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 LDF [AO + 8 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 10 * SIZE], a3 LDF [AO + 11 * SIZE], a4 LDF [AO + 12 * SIZE], b1 LDF [AO + 13 * SIZE], b2 LDF [AO + 14 * SIZE], b3 LDF [AO + 15 * SIZE], b4 FSUB a1, c09, c09 FSUB a2, c10, c10 FSUB a3, c11, c11 FSUB a4, c12, c12 FSUB b1, c13, c13 FSUB b2, c14, c14 FSUB b3, c15, c15 FSUB b4, c16, c16#endif#ifdef LN LDF [AO + 15 * SIZE], a1 LDF [AO + 14 * SIZE], a2 LDF [AO + 13 * SIZE], a3 LDF [AO + 12 * SIZE], a4 FMUL a1, c04, c04 FMUL a1, c08, c08 FMUL a1, c12, c12 FMUL a1, c16, c16 FMUL a2, c04, t1 FMUL a2, c08, t2 FMUL a2, c12, t3 FMUL a2, c16, t4 FSUB c03, t1, c03 FSUB c07, t2, c07 FSUB c11, t3, c11 FSUB c15, t4, c15 FMUL a3, c04, t1 FMUL a3, c08, t2 FMUL a3, c12, t3 FMUL a3, c16, t4 FSUB c02, t1, c02 FSUB c06, t2, c06 FSUB c10, t3, c10 FSUB c14, t4, c14 FMUL a4, c04, t1 FMUL a4, c08, t2 FMUL a4, c12, t3 FMUL a4, c16, t4 FSUB c01, t1, c01 FSUB c05, t2, c05 FSUB c09, t3, c09 FSUB c13, t4, c13 LDF [AO + 10 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 8 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c07, c07 FMUL a1, c11, c11 FMUL a1, c15, c15 FMUL a2, c03, t1 FMUL a2, c07, t2 FMUL a2, c11, t3 FMUL a2, c15, t4 FSUB c02, t1, c02 FSUB c06, t2, c06 FSUB c10, t3, c10 FSUB c14, t4, c14 FMUL a3, c03, t1 FMUL a3, c07, t2 FMUL a3, c11, t3 FMUL a3, c15, t4 FSUB c01, t1, c01 FSUB c05, t2, c05 FSUB c09, t3, c09 FSUB c13, t4, c13 LDF [AO + 5 * SIZE], a1 LDF [AO + 4 * SIZE], a2 FMUL a1, c02, c02 FMUL a1, c06, c06 FMUL a1, c10, c10 FMUL a1, c14, c14 FMUL a2, c02, t1 FMUL a2, c06, t2 FMUL a2, c10, t3 FMUL a2, c14, t4 FSUB c01, t1, c01 FSUB c05, t2, c05 FSUB c09, t3, c09 FSUB c13, t4, c13 LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c05, c05 FMUL a1, c09, c09 FMUL a1, c13, c13#endif#ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c05, c05 FMUL a1, c09, c09 FMUL a1, c13, c13 FMUL a2, c01, t1 FMUL a2, c05, t2 FMUL a2, c09, t3 FMUL a2, c13, t4 FSUB c02, t1, c02 FSUB c06, t2, c06 FSUB c10, t3, c10 FSUB c14, t4, c14 FMUL a3, c01, t1 FMUL a3, c05, t2 FMUL a3, c09, t3 FMUL a3, c13, t4 FSUB c03, t1, c03 FSUB c07, t2, c07 FSUB c11, t3, c11 FSUB c15, t4, c15 FMUL a4, c01, t1 FMUL a4, c05, t2 FMUL a4, c09, t3 FMUL a4, c13, t4 FSUB c04, t1, c04 FSUB c08, t2, c08 FSUB c12, t3, c12 FSUB c16, t4, c16 LDF [AO + 5 * SIZE], a1 LDF [AO + 6 * SIZE], a2 LDF [AO + 7 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c06, c06 FMUL a1, c10, c10 FMUL a1, c14, c14 FMUL a2, c02, t1 FMUL a2, c06, t2 FMUL a2, c10, t3 FMUL a2, c14, t4 FSUB c03, t1, c03 FSUB c07, t2, c07 FSUB c11, t3, c11 FSUB c15, t4, c15 FMUL a3, c02, t1 FMUL a3, c06, t2 FMUL a3, c10, t3 FMUL a3, c14, t4 FSUB c04, t1, c04 FSUB c08, t2, c08 FSUB c12, t3, c12 FSUB c16, t4, c16 LDF [AO + 10 * SIZE], a1 LDF [AO + 11 * SIZE], a2 FMUL a1, c03, c03 FMUL a1, c07, c07 FMUL a1, c11, c11 FMUL a1, c15, c15 FMUL a2, c03, t1 FMUL a2, c07, t2 FMUL a2, c11, t3 FMUL a2, c15, t4 FSUB c04, t1, c04 FSUB c08, t2, c08 FSUB c12, t3, c12 FSUB c16, t4, c16 LDF [AO + 15 * SIZE], a1 FMUL a1, c04, c04 FMUL a1, c08, c08 FMUL a1, c12, c12 FMUL a1, c16, c16#endif#ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c01, t1 FMUL a2, c02, t2 FMUL a2, c03, t3 FMUL a2, c04, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FSUB c07, t3, c07 FSUB c08, t4, c08 FMUL a3, c01, t1 FMUL a3, c02, t2 FMUL a3, c03, t3 FMUL a3, c04, t4 FSUB c09, t1, c09 FSUB c10, t2, c10 FSUB c11, t3, c11 FSUB c12, t4, c12 FMUL a4, c01, t1 FMUL a4, c02, t2 FMUL a4, c03, t3 FMUL a4, c04, t4 FSUB c13, t1, c13 FSUB c14, t2, c14 FSUB c15, t3, c15 FSUB c16, t4, c16 LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a1, c07, c07 FMUL a1, c08, c08 FMUL a2, c05, t1 FMUL a2, c06, t2 FMUL a2, c07, t3 FMUL a2, c08, t4 FSUB c09, t1, c09 FSUB c10, t2, c10 FSUB c11, t3, c11 FSUB c12, t4, c12 FMUL a3, c05, t1 FMUL a3, c06, t2 FMUL a3, c07, t3 FMUL a3, c08, t4 FSUB c13, t1, c13 FSUB c14, t2, c14 FSUB c15, t3, c15 FSUB c16, t4, c16 LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c09, c09 FMUL a1, c10, c10 FMUL a1, c11, c11 FMUL a1, c12, c12 FMUL a2, c09, t1 FMUL a2, c10, t2 FMUL a2, c11, t3 FMUL a2, c12, t4 FSUB c13, t1, c13 FSUB c14, t2, c14 FSUB c15, t3, c15 FSUB c16, t4, c16 LDF [BO + 15 * SIZE], a1 FMUL a1, c13, c13 FMUL a1, c14, c14 FMUL a1, c15, c15 FMUL a1, c16, c16#endif#ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c13, c13 FMUL a1, c14, c14 FMUL a1, c15, c15 FMUL a1, c16, c16 FMUL a2, c13, t1 FMUL a2, c14, t2 FMUL a2, c15, t3 FMUL a2, c16, t4 FSUB c09, t1, c09 FSUB c10, t2, c10 FSUB c11, t3, c11 FSUB c12, t4, c12 FMUL a3, c13, t1 FMUL a3, c14, t2 FMUL a3, c15, t3 FMUL a3, c16, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FSUB c07, t3, c07 FSUB c08, t4, c08 FMUL a4, c13, t1 FMUL a4, c14, t2 FMUL a4, c15, t3 FMUL a4, c16, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c09, c09 FMUL a1, c10, c10 FMUL a1, c11, c11 FMUL a1, c12, c12 FMUL a2, c09, t1 FMUL a2, c10, t2 FMUL a2, c11, t3 FMUL a2, c12, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FSUB c07, t3, c07 FSUB c08, t4, c08 FMUL a3, c09, t1 FMUL a3, c10, t2 FMUL a3, c11, t3 FMUL a3, c12, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a1, c07, c07 FMUL a1, c08, c08 FMUL a2, c05, t1 FMUL a2, c06, t2 FMUL a2, c07, t3 FMUL a2, c08, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04#endif#ifdef LN add C1, -4 * SIZE, C1 add C2, -4 * SIZE, C2 add C3, -4 * SIZE, C3 add C4, -4 * SIZE, C4#endif#if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c05, [BO + 1 * SIZE] STF c09, [BO + 2 * SIZE] STF c13, [BO + 3 * SIZE] STF c02, [BO + 4 * SIZE] STF c06, [BO + 5 * SIZE] STF c10, [BO + 6 * SIZE] STF c14, [BO + 7 * SIZE] STF c03, [BO + 8 * SIZE] STF c07, [BO + 9 * SIZE] STF c11, [BO + 10 * SIZE] STF c15, [BO + 11 * SIZE] STF c04, [BO + 12 * SIZE] STF c08, [BO + 13 * SIZE] STF c12, [BO + 14 * SIZE] STF c16, [BO + 15 * SIZE]#else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE] STF c09, [AO + 8 * SIZE] STF c10, [AO + 9 * SIZE] STF c11, [AO + 10 * SIZE] STF c12, [AO + 11 * SIZE] STF c13, [AO + 12 * SIZE] STF c14, [AO + 13 * SIZE] STF c15, [AO + 14 * SIZE] STF c16, [AO + 15 * SIZE]#endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] STF c07, [C2 + 2 * SIZE] STF c08, [C2 + 3 * SIZE] STF c09, [C3 + 0 * SIZE] STF c10, [C3 + 1 * SIZE] STF c11, [C3 + 2 * SIZE] STF c12, [C3 + 3 * SIZE] STF c13, [C4 + 0 * SIZE] STF c14, [C4 + 1 * SIZE] STF c15, [C4 + 2 * SIZE] STF c16, [C4 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4#ifndef LN add C1, 4 * SIZE, C1 add C2, 4 * SIZE, C2 add C3, 4 * SIZE, C3 add C4, 4 * SIZE, C4#endif#ifdef RT sll K, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG#endif#if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO#endif#ifdef LT add KK, 4, KK#endif#ifdef LN sub KK, 4, KK#endif add I, -1, I cmp I, 0 sra K, 2, L bg,pt %icc, .LL21 FMOV FZERO, c01.LL50: and M, 2, I cmp I, 0 ble,pn %icc, .LL70 nop#if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0#else#ifdef LN sll K, 1 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG#endif sll KK, 1 + BASE_SHIFT, TEMP1 sll KK, 2 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0#endif FMOV FZERO, c02 FMOV FZERO, t1 FMOV FZERO, c04 LDF [AO + 0 * SIZE], a1 FMOV FZERO, t2 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c06 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t3 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c08 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t4 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c01 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c03 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c05 ble,pn %icc, .LL55 FMOV FZERO, c07.LL52: FADD c02, t1, c02 add AO, 8 * SIZE, AO prefetch [AO + APREFETCHSIZE * SIZE], 0
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -