📄 trsm_kernel_rt.s
字号:
FADD c03, t3, c03 cmp L, 0 FMUL a2, b1, t3 LDF [BO - 4 * SIZE], b1 FADD c04, t4, c04 nop FMUL a2, b2, t4 LDF [AO + 5 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b3, t1 LDF [BO - 3 * SIZE], b2 FADD c02, t2, c02 nop FMUL a3, b4, t2 LDF [AO + 6 * SIZE], a3 FADD c03, t3, c03 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c04, t4, c04 nop FMUL a4, b4, t4 LDF [AO + 7 * SIZE], a4 FADD c01, t1, c01 nop FMUL a1, b1, t1 LDF [BO - 1 * SIZE], b4 FADD c02, t2, c02 FMUL a1, b2, t2 LDF [AO + 8 * SIZE], a1 FADD c03, t3, c03 FMUL a2, b1, t3 LDF [BO + 0 * SIZE], b1 FADD c04, t4, c04 FMUL a2, b2, t4 LDF [AO + 9 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b3, t1 LDF [BO + 1 * SIZE], b2 FADD c02, t2, c02 FMUL a3, b4, t2 LDF [AO + 10 * SIZE], a3 FADD c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO bg,pt %icc, .LL152 LDF [BO + 3 * SIZE], b4.LL155:#if defined(LT) || defined(RN) and KK, 3, L#else and TEMP1, 3, L#endif cmp L, 0 ble,a,pn %icc, .LL159 nop.LL156: LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FMUL a1, b1, t1 FMUL a1, b2, t2 FMUL a2, b1, t3 FMUL a2, b2, t4 add AO, 2 * SIZE, AO add BO, 2 * SIZE, BO add L, -1, L cmp L, 0 bg,pt %icc, .LL156 nop.LL159: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04#if defined(LN) || defined(RT)#ifdef LN sub KK, 2, TEMP1#else sub KK, 2, TEMP1#endif sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO#endif#if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04#else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c02, c02 FSUB a4, c04, c04#endif#ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c03, t1 FMUL a2, c04, t2 FSUB c01, t1, c01 FSUB c02, t2, c02 FMUL a3, c01, c01 FMUL a3, c02, c02#endif#ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a2, c01, t1 FMUL a2, c02, t2 FSUB c03, t1, c03 FSUB c04, t2, c04 FMUL a3, c03, c03 FMUL a3, c04, c04#endif#ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a2, c01, t1 FMUL a2, c03, t2 FSUB c02, t1, c02 FSUB c04, t2, c04 FMUL a3, c02, c02 FMUL a3, c04, c04#endif#ifdef RT LDF [BO + 3 * SIZE], a1 LDF [BO + 2 * SIZE], a2 LDF [BO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FMUL a2, c02, t1 FMUL a2, c04, t2 FSUB c01, t1, c01 FSUB c03, t2, c03 FMUL a3, c01, c01 FMUL a3, c03, c03#endif#ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2#endif#if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c03, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE]#else STF c01, [AO + 0 * SIZE] STF c03, [AO + 1 * SIZE] STF c02, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE]#endif STF c01, [C1 + 0 * SIZE] STF c03, [C1 + 1 * SIZE] STF c02, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4#ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2#endif#ifdef RT sll K, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG#endif#if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO#endif#ifdef LT add KK, 2, KK#endif#ifdef LN sub KK, 2, KK#endif.LL170: and M, 1, I cmp I, 0 ble,pn %icc, .LL199 nop#if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0#else#ifdef LN sll K, 0 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG#endif sll KK, 0 + BASE_SHIFT, TEMP1 sll KK, 1 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0#endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL175 nop.LL172: FADD c01, t1, c01 add AO, 4 * SIZE, AO FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 add L, -1, L LDF [AO + 0 * SIZE], a1 FADD c03, t3, c03 cmp L, 0 FMUL a2, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 FMUL a2, b4, t4 LDF [BO + 7 * SIZE], b4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b1, t1 LDF [BO + 8 * SIZE], b1 FADD c02, t2, c02 FMUL a3, b2, t2 LDF [BO + 9 * SIZE], b2 LDF [AO + 2 * SIZE], a3 FADD c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 10 * SIZE], b3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [BO + 11 * SIZE], b4 add BO, 8 * SIZE, BO bg,pt %icc, .LL172 LDF [AO + 3 * SIZE], a4.LL175:#if defined(LT) || defined(RN) and KK, 3, L#else and TEMP1, 3, L#endif cmp L, 0 ble,a,pn %icc, .LL179 nop.LL176: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 add AO, 1 * SIZE, AO LDF [BO + 2 * SIZE], b1 FADD c02, t2, c02 cmp L, 0 FMUL a1, b2, t2 LDF [BO + 3 * SIZE], b2 add BO, 2 * SIZE, BO bg,pt %icc, .LL176 LDF [AO + 0 * SIZE], a1.LL179: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FADD c01, c03, c01 FADD c02, c04, c02#if defined(LN) || defined(RT)#ifdef LN sub KK, 1, TEMP1#else sub KK, 2, TEMP1#endif sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO#endif#if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02#else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02#endif#ifdef LN LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02#endif#ifdef LT LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02#endif#ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a2, c01, t1 FSUB c02, t1, c02 FMUL a3, c02, c02#endif#ifdef RT LDF [BO + 3 * SIZE], a1 LDF [BO + 2 * SIZE], a2 LDF [BO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c01, t1, c01 FMUL a3, c01, c01#endif#ifdef LN add C1, -1 * SIZE, C1 add C2, -1 * SIZE, C2#endif#if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE]#else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE]#endif STF c01, [C1 + 0 * SIZE] STF c02, [C2 + 0 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4#ifndef LN add C1, 1 * SIZE, C1 add C2, 1 * SIZE, C2#endif#ifdef RT sll K, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG#endif#if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO#endif#ifdef LT add KK, 1, KK#endif#ifdef LN sub KK, 1, KK#endif.LL199:#ifdef LN sll K, 1 + BASE_SHIFT, TEMP1 add B, TEMP1, B#endif#if defined(LT) || defined(RN) mov BO, B#endif#ifdef RN add KK, 2, KK#endif#ifdef RT sub KK, 2, KK#endif.LL200: sra N, 2, J cmp J, 0 ble,pn %icc, .LL999 nop.LL11:#ifdef RT sll K, 2 + BASE_SHIFT, TEMP1 sub B, TEMP1, B sll LDC, 2, TEMP1 sub C, TEMP1, C#endif add C, LDC, C2 FMOV FZERO, t1 nop mov C, C1 add C2, LDC, C3 FMOV FZERO, t2 nop mov A, AO sra M, 2, I add C3, LDC, C4 FMOV FZERO, t3#ifdef LN add M, OFFSET, KK#endif#ifdef LT mov OFFSET, KK#endif#if defined(LN) || defined(RT) mov A, AORIG#else mov A, AO#endif cmp I, 0#ifndef RT add C4, LDC, C#endif FMOV FZERO, t4 ble,pn %icc, .LL50 FMOV FZERO, c01.LL21: FMOV FZERO, c02 FMOV FZERO, c03#if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0#else#ifdef LN sll K, 2 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG#endif sll KK, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0#endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c04 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c05 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c06 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c07 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c08 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c09 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c10 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c11 LDF [BO + 4 * SIZE], b5 /* ***** */ LDF [AO + 4 * SIZE], a5 /* ***** */ prefetch [C1 + 3 * SIZE], 3 FMOV FZERO, c12 prefetch [C2 + 3 * SIZE], 3 FMOV FZERO, c13 prefetch [C3 + 3 * SIZE], 3 FMOV FZERO, c14 prefetch [C4 + 3 * SIZE], 3 FMOV FZERO, c15 ble,pn %icc, .LL25 FMOV FZERO, c16.LL22: FADD c04, t1, c04 prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY FMUL a1, b1, t1 nop FADD c08, t2, c08 prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY FMUL a1, b2, t2 add AO, 16 * SIZE, AO FADD c12, t3, c12 LDF [AO - 13 * SIZE], a4 FMUL a1, b3, t3 add BO, 16 * SIZE, BO FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 8 * SIZE], a1 FADD c01, t1, c01 nop FMUL a2, b1, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 add L, -1, L FMUL a2, b4, t4 LDF [AO - 11 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b1, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 10 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 11 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 10 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 9 * SIZE], b4 FADD c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 9 * SIZE], a4 FADD c08, t2, c08 nop FMUL a5, b2, t2 nop FADD c12, t3, c12 nop FMUL a5, b3, t3 nop FADD c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO - 4 * SIZE], a5 FADD c01, t1, c01 nop FMUL a2, b5, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b5, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 6 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b5, t1 LDF [BO - 4 * SIZE], b5 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD c04, t1, c04 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD c08, t2, c08 nop FMUL a1, b2, t2 nop FADD c12, t3, c12 nop FMUL a1, b3, t3 nop FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 0 * SIZE], a1 FADD c01, t1, c01 nop FMUL a2, b1, t1 nop#ifdef DOUBLE prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY#else nop#endif FADD c05, t2, c05 nop FMUL a2, b2, t2 FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 nop FADD c02, t1, c02 nop FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD c06, t2, c06#ifdef DOUBLE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -