📄 ztrsm_kernel_ln.s
字号:
/*
 * Tail of a SPARC kernel routine (the code uses %icc, %i7 and %o0).
 * This excerpt begins inside a preprocessor conditional that was opened
 * earlier in the file and runs to EPILOGUE.
 *
 * Every upper-case operand and mnemonic (A, AO, AORIG, B, BO, C, C1, LDC,
 * M, K, KK, I, L, TEMP1, TEMP2, SIZE, ZBASE_SHIFT, APREFETCHSIZE, FZERO,
 * LDF, STF, FMOV, FMUL, FADD, FSUB, FADD1..FADD4, EPILOGUE) and the
 * a1..a4 / b1..b4 / c01..c08 / t1..t8 names are macros defined outside
 * this excerpt.
 * NOTE(review): FADD1..FADD4 presumably select add vs. subtract per
 * conjugation variant, and ZBASE_SHIFT is presumably log2 of the byte
 * size of one complex element - confirm against the definitions.
 *
 * Structure of the visible code:
 *   1. If (M & 1) > 0: one pass over a block that accumulates into
 *      c01..c04 (.LL152/.LL156), reduces to c01/c02 and solves (.LL159).
 *   2. .LL121: loop I = M >> 1 times over a block that accumulates into
 *      c01..c08 (.LL122/.LL126), reduces to c01..c04 and solves (.LL129).
 *   3. .LL199/.LL999: advance B / KK for the build variant and return.
 *
 * The LN, LT, RN and RT preprocessor symbols select the variant; the code
 * below assumes the build defines the one that is wanted.
 */
#else
	mov	A, AO
#endif

#ifndef RT
	add	C, LDC, C
#endif

	/* ---- Block for the odd remainder of M: I = M & 1 ---- */
	and	M, 1, I
	cmp	I, 0
	ble,pn	%icc, .LL150		/* nothing to do when (M & 1) <= 0 */
	nop				/* branch delay slot */

#if defined(LT) || defined(RN)
	sra	KK, 2, L		/* L = KK / 4: trip count of the 4x unrolled loop */
	mov	B, BO
	cmp	L, 0
#else

#ifdef LN
	sll	K, 0 + ZBASE_SHIFT, TEMP1
	sub	AORIG, TEMP1, AORIG	/* step AORIG back by K << ZBASE_SHIFT */
#endif

	sll	KK, 0 + ZBASE_SHIFT, TEMP1
	add	AORIG, TEMP1, AO	/* AO = AORIG + (KK << ZBASE_SHIFT) */
	add	B, TEMP1, BO		/* BO = B     + (KK << ZBASE_SHIFT) */

	sub	K, KK, TEMP1		/* TEMP1 = K - KK, reused at .LL155 */
	sra	TEMP1, 2, L		/* L = (K - KK) / 4 */
	cmp	L, 0
#endif

	/*
	 * Preload a1..a4 / b1..b4 and zero the accumulators and temporaries.
	 * The branch below still tests the "cmp L, 0" above; this relies on
	 * LDF and FMOV not writing the integer condition codes
	 * (NOTE(review): they are macros - confirm).
	 */
	LDF	[AO + 0 * SIZE], a1
	FMOV	FZERO, c01
	LDF	[BO + 0 * SIZE], b1
	FMOV	FZERO, t1

	LDF	[AO + 1 * SIZE], a2
	FMOV	FZERO, c02
	LDF	[BO + 1 * SIZE], b2
	FMOV	FZERO, t2

	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, c03
	LDF	[BO + 2 * SIZE], b3
	FMOV	FZERO, t3

	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c04
	LDF	[BO + 3 * SIZE], b4
	FMOV	FZERO, t4

	ble,pn	%icc, .LL155		/* L <= 0: skip the unrolled loop */
	nop

	/*
	 * Unrolled accumulation loop.  Each pass decrements L once, advances
	 * AO and BO by 8 * SIZE each and issues 16 FMULs.  Products land in
	 * t1..t4 and are folded into c01..c04 by the FADDn that follows on the
	 * next use, so the last four products are still pending on loop exit.
	 */
.LL152:
	FADD1	c01, t1, c01
	add	L, -1, L
	FMUL	a1, b1, t1
	prefetch [AO + APREFETCHSIZE * SIZE], 0

	FADD3	c02, t2, c02
	add	BO, 8 * SIZE, BO	/* BO bumped early: BO loads below use offsets -4..+3 */
	FMUL	a1, b2, t2
	LDF	[AO + 4 * SIZE], a1

	FADD2	c03, t3, c03
	cmp	L, 0			/* sets %icc for the bg at the bottom of the loop */
	FMUL	a2, b1, t3
	LDF	[BO - 4 * SIZE], b1

	FADD4	c04, t4, c04
	nop
	FMUL	a2, b2, t4
	LDF	[AO + 5 * SIZE], a2

	FADD1	c01, t1, c01
	nop
	FMUL	a3, b3, t1
	LDF	[BO - 3 * SIZE], b2

	FADD3	c02, t2, c02
	nop
	FMUL	a3, b4, t2
	LDF	[AO + 6 * SIZE], a3

	FADD2	c03, t3, c03
	nop
	FMUL	a4, b3, t3
	LDF	[BO - 2 * SIZE], b3

	FADD4	c04, t4, c04
	nop
	FMUL	a4, b4, t4
	LDF	[AO + 7 * SIZE], a4

	FADD1	c01, t1, c01
	nop
	FMUL	a1, b1, t1
	LDF	[BO - 1 * SIZE], b4

	FADD3	c02, t2, c02
	FMUL	a1, b2, t2
	LDF	[AO + 8 * SIZE], a1

	FADD2	c03, t3, c03
	FMUL	a2, b1, t3
	LDF	[BO + 0 * SIZE], b1

	FADD4	c04, t4, c04
	FMUL	a2, b2, t4
	LDF	[AO + 9 * SIZE], a2

	FADD1	c01, t1, c01
	FMUL	a3, b3, t1
	LDF	[BO + 1 * SIZE], b2

	FADD3	c02, t2, c02
	FMUL	a3, b4, t2
	LDF	[AO + 10 * SIZE], a3

	FADD2	c03, t3, c03
	FMUL	a4, b3, t3
	LDF	[BO + 2 * SIZE], b3

	FADD4	c04, t4, c04
	FMUL	a4, b4, t4
	LDF	[AO + 11 * SIZE], a4

	add	AO, 8 * SIZE, AO
	bg,pt	%icc, .LL152
	LDF	[BO + 3 * SIZE], b4	/* delay slot: executed on every pass */

	/* Remainder: L = (count & 3) leftover iterations */
.LL155:
#if defined(LT) || defined(RN)
	and	KK, 3, L
#else
	and	TEMP1, 3, L		/* TEMP1 still holds K - KK */
#endif
	cmp	L, 0
	ble,a,pn %icc, .LL159		/* ",a" = annul bit; the delay slot is a nop either way */
	nop

	/* One step per pass: AO and BO advance by 2 * SIZE, 4 FMULs */
.LL156:
	FADD1	c01, t1, c01
	add	AO, 2 * SIZE, AO
	FMUL	a1, b1, t1
	add	BO, 2 * SIZE, BO

	FADD3	c02, t2, c02
	add	L, -1, L
	FMUL	a1, b2, t2
	LDF	[AO + 0 * SIZE], a1

	FADD2	c03, t3, c03
	FMUL	a2, b1, t3
	LDF	[BO + 0 * SIZE], b1
	cmp	L, 0

	FADD4	c04, t4, c04
	FMUL	a2, b2, t4
	LDF	[BO + 1 * SIZE], b2

	bg,pt	%icc, .LL156
	LDF	[AO + 1 * SIZE], a2	/* delay slot */

	/* Flush the pending products, then reduce c01..c04 to c01/c02 */
.LL159:
	FADD1	c01, t1, c01
	FADD3	c02, t2, c02
	FADD2	c03, t3, c03
	FADD4	c04, t4, c04

	FADD	c01, c04, c01
	FADD	c02, c03, c02

#if defined(LN) || defined(RT)
	/* Re-point AO / BO at offset (KK - 1) << ZBASE_SHIFT */
	sub	KK, 1, TEMP1
	sll	TEMP1, 0 + ZBASE_SHIFT, TEMP1
	add	AORIG, TEMP1, AO
	add	B, TEMP1, BO
#endif

	/*
	 * Combine the accumulated sums with the two values stored at BO
	 * (LN/LT) or AO (RN/RT).
	 * NOTE(review): assuming FSUB follows SPARC fsub operand order
	 * (rd = rs1 - rs2), this computes c = loaded - c.
	 */
#if defined(LN) || defined(LT)
	LDF	[BO + 0 * SIZE], a1
	LDF	[BO + 1 * SIZE], a2

	FSUB	a1, c01, c01
	FSUB	a2, c02, c02
#else
	LDF	[AO + 0 * SIZE], a1
	LDF	[AO + 1 * SIZE], a2

	FSUB	a1, c01, c01
	FSUB	a2, c02, c02
#endif

	/*
	 * Multiply the pair (c01, c02) by the pair (a1, a2) loaded from AO
	 * (LN/LT) or BO (RN/RT): four cross products combined by two FADDn.
	 * LN/LT combine with FADD4/FADD2, RN/RT with FADD4/FADD3.
	 * NOTE(review): looks like a complex multiply by a diagonal entry of
	 * the packed triangular operand - confirm.
	 */
#ifdef LN
	LDF	[AO + 0 * SIZE], a1
	LDF	[AO + 1 * SIZE], a2

	FMUL	a1, c01, t1
	FMUL	a2, c02, t2
	FMUL	a1, c02, t3
	FMUL	a2, c01, t4

	FADD4	t1, t2, c01
	FADD2	t3, t4, c02
#endif

#ifdef LT
	LDF	[AO + 0 * SIZE], a1
	LDF	[AO + 1 * SIZE], a2

	FMUL	a1, c01, t1
	FMUL	a2, c02, t2
	FMUL	a1, c02, t3
	FMUL	a2, c01, t4

	FADD4	t1, t2, c01
	FADD2	t3, t4, c02
#endif

#ifdef RN
	LDF	[BO + 0 * SIZE], a1
	LDF	[BO + 1 * SIZE], a2

	FMUL	a1, c01, t1
	FMUL	a2, c02, t2
	FMUL	a1, c02, t3
	FMUL	a2, c01, t4

	FADD4	t1, t2, c01
	FADD3	t3, t4, c02
#endif

#ifdef RT
	LDF	[BO + 0 * SIZE], a1
	LDF	[BO + 1 * SIZE], a2

	FMUL	a1, c01, t1
	FMUL	a2, c02, t2
	FMUL	a1, c02, t3
	FMUL	a2, c01, t4

	FADD4	t1, t2, c01
	FADD3	t3, t4, c02
#endif

#ifdef LN
	add	C1, -2 * SIZE, C1	/* LN moves C1 down before storing */
#endif

	/* Write the result back to BO (LN/LT) or AO (RN/RT), and to C1 */
#if defined(LN) || defined(LT)
	STF	c01, [BO + 0 * SIZE]
	STF	c02, [BO + 1 * SIZE]
#else
	STF	c01, [AO + 0 * SIZE]
	STF	c02, [AO + 1 * SIZE]
#endif

	STF	c01, [C1 + 0 * SIZE]
	STF	c02, [C1 + 1 * SIZE]

	/* Clear the temporaries for the next block */
	FMOV	FZERO, t1
	FMOV	FZERO, t2
	FMOV	FZERO, t3
	FMOV	FZERO, t4

#ifndef LN
	add	C1, 2 * SIZE, C1	/* every variant except LN moves C1 up after storing */
#endif

#ifdef RT
	sll	K, 0 + ZBASE_SHIFT, TEMP1
	add	AORIG, TEMP1, AORIG
#endif

#if defined(LT) || defined(RN)
	/* Advance AO and BO by (K - KK) << ZBASE_SHIFT */
	sub	K, KK, TEMP1
	sll	TEMP1, 0 + ZBASE_SHIFT, TEMP1
	add	AO, TEMP1, AO
	add	BO, TEMP1, BO
#endif

#ifdef LT
	add	KK, 1, KK
#endif

#ifdef LN
	sub	KK, 1, KK
#endif

	/* ---- Main loop over M: I = M >> 1 passes through .LL121 ---- */
.LL150:
	sra	M, 1, I
	cmp	I, 0
	ble,pn	%icc, .LL199
	nop

.LL121:
#if defined(LT) || defined(RN)
	sra	KK, 2, L		/* L = KK / 4 */
	mov	B, BO
	cmp	L, 0
#else

#ifdef LN
	sll	K, 1 + ZBASE_SHIFT, TEMP1
	sub	AORIG, TEMP1, AORIG	/* step AORIG back by K << (1 + ZBASE_SHIFT) */
#endif

	/* AO is offset with shift 1 + ZBASE_SHIFT, BO with shift 0 + ZBASE_SHIFT */
	sll	KK, 1 + ZBASE_SHIFT, TEMP1
	sll	KK, 0 + ZBASE_SHIFT, TEMP2

	add	AORIG, TEMP1, AO
	add	B, TEMP2, BO

	sub	K, KK, TEMP1		/* TEMP1 = K - KK, reused at .LL125 */
	sra	TEMP1, 2, L		/* L = (K - KK) / 4 */
	cmp	L, 0
#endif

	/* Preload a1..a4 / b1..b4, zero c01..c08 and t1..t4 */
	FMOV	FZERO, c03
	LDF	[AO + 0 * SIZE], a1
	FMOV	FZERO, t1
	LDF	[BO + 0 * SIZE], b1
	FMOV	FZERO, c07

	LDF	[AO + 1 * SIZE], a2
	FMOV	FZERO, t2
	LDF	[BO + 1 * SIZE], b2
	FMOV	FZERO, c04

	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, t3
	LDF	[BO + 2 * SIZE], b3
	FMOV	FZERO, c08

	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, t4
	LDF	[BO + 3 * SIZE], b4
	FMOV	FZERO, c01

#ifdef LN
	prefetch [C1 - 3 * SIZE], 3
#else
	prefetch [C1 + 3 * SIZE], 3
#endif
	FMOV	FZERO, c05
	FMOV	FZERO, c02

	ble,pn	%icc, .LL125		/* tests the "cmp L, 0" above */
	FMOV	FZERO, c06		/* delay slot: runs whether or not the branch is taken */

	/*
	 * Unrolled accumulation loop.  Each pass decrements L once, advances
	 * AO by 16 * SIZE and BO by 8 * SIZE and issues 32 FMULs.  AO is bumped
	 * near the top, so the later AO loads use offsets -11..+3.
	 */
.LL122:
	FADD1	c03, t1, c03
	add	L, -1, L
	FMUL	a1, b1, t1
	prefetch [AO + APREFETCHSIZE * SIZE], 0

	FADD3	c07, t2, c07
	add	BO, 8 * SIZE, BO
	FMUL	a1, b2, t2
	LDF	[AO + 4 * SIZE], a1

	FADD2	c04, t3, c04
	add	AO, 16 * SIZE, AO
	FMUL	a2, b1, t3
	cmp	L, 0			/* sets %icc for the bg at the bottom of the loop */

	FADD4	c08, t4, c08
	nop
	FMUL	a2, b2, t4
	LDF	[AO - 11 * SIZE], a2

	FADD1	c01, t1, c01
	nop
	FMUL	a3, b1, t1
	nop

	FADD3	c05, t2, c05
	nop
	FMUL	a3, b2, t2
	LDF	[AO - 10 * SIZE], a3

	FADD2	c02, t3, c02
	nop
	FMUL	a4, b1, t3
	LDF	[BO - 4 * SIZE], b1

	FADD4	c06, t4, c06
	nop
	FMUL	a4, b2, t4
	LDF	[BO - 3 * SIZE], b2

	FADD1	c03, t1, c03
	nop
	FMUL	a1, b3, t1
	LDF	[AO - 9 * SIZE], a4

	FADD3	c07, t2, c07
	nop
	FMUL	a1, b4, t2
	LDF	[AO - 8 * SIZE], a1

	FADD2	c04, t3, c04
	nop
	FMUL	a2, b3, t3
	nop

	FADD4	c08, t4, c08
	nop
	FMUL	a2, b4, t4
	LDF	[AO - 7 * SIZE], a2

	FADD1	c01, t1, c01
	nop
	FMUL	a3, b3, t1
	nop

	FADD3	c05, t2, c05
	nop
	FMUL	a3, b4, t2
	LDF	[AO - 6 * SIZE], a3

	FADD2	c02, t3, c02
	nop
	FMUL	a4, b3, t3
	LDF	[BO - 2 * SIZE], b3

	FADD4	c06, t4, c06
	nop
	FMUL	a4, b4, t4
	LDF	[BO - 1 * SIZE], b4

	FADD1	c03, t1, c03
	nop
	FMUL	a1, b1, t1
	LDF	[AO - 5 * SIZE], a4

	FADD3	c07, t2, c07
	nop
	FMUL	a1, b2, t2
	LDF	[AO - 4 * SIZE], a1

	FADD2	c04, t3, c04
	nop
	FMUL	a2, b1, t3
	nop

	FADD4	c08, t4, c08
	nop
	FMUL	a2, b2, t4
	LDF	[AO - 3 * SIZE], a2

	FADD1	c01, t1, c01
	nop
	FMUL	a3, b1, t1
	nop

	FADD3	c05, t2, c05
	nop
	FMUL	a3, b2, t2
	LDF	[AO - 2 * SIZE], a3

	FADD2	c02, t3, c02
	nop
	FMUL	a4, b1, t3
	LDF	[BO + 0 * SIZE], b1

	FADD4	c06, t4, c06
	nop
	FMUL	a4, b2, t4
	LDF	[BO + 1 * SIZE], b2

	FADD1	c03, t1, c03
	nop
	FMUL	a1, b3, t1
	LDF	[AO - 1 * SIZE], a4

	FADD3	c07, t2, c07
	nop
	FMUL	a1, b4, t2
	LDF	[AO + 0 * SIZE], a1

	FADD2	c04, t3, c04
	nop
	FMUL	a2, b3, t3
	nop

	FADD4	c08, t4, c08
	nop
	FMUL	a2, b4, t4
	LDF	[AO + 1 * SIZE], a2

	FADD1	c01, t1, c01
	nop
	FMUL	a3, b3, t1
	nop

	FADD3	c05, t2, c05
	nop
	FMUL	a3, b4, t2
	LDF	[AO + 2 * SIZE], a3

	FADD2	c02, t3, c02
	nop
	FMUL	a4, b3, t3
	LDF	[BO + 2 * SIZE], b3

	FADD4	c06, t4, c06
	FMUL	a4, b4, t4
	LDF	[AO + 3 * SIZE], a4

	bg,pt	%icc, .LL122
	LDF	[BO + 3 * SIZE], b4	/* delay slot: executed on every pass */

	/* Remainder: L = (count & 3) leftover iterations */
.LL125:
#if defined(LT) || defined(RN)
	and	KK, 3, L
#else
	and	TEMP1, 3, L		/* TEMP1 still holds K - KK */
#endif
	cmp	L, 0
	ble,a,pn %icc, .LL129
	nop

	/* One step per pass: AO advances by 4 * SIZE, BO by 2 * SIZE, 8 FMULs */
.LL126:
	FADD1	c03, t1, c03
	add	AO, 4 * SIZE, AO
	FMUL	a1, b1, t1
	add	BO, 2 * SIZE, BO

	FADD3	c07, t2, c07
	add	L, -1, L
	FMUL	a1, b2, t2
	LDF	[AO + 0 * SIZE], a1

	FADD2	c04, t3, c04
	cmp	L, 0
	FMUL	a2, b1, t3

	FADD4	c08, t4, c08
	FMUL	a2, b2, t4
	LDF	[AO + 1 * SIZE], a2

	FADD1	c01, t1, c01
	FMUL	a3, b1, t1
	FADD3	c05, t2, c05
	FMUL	a3, b2, t2
	LDF	[AO + 2 * SIZE], a3

	FADD2	c02, t3, c02
	FMUL	a4, b1, t3
	LDF	[BO + 0 * SIZE], b1

	FADD4	c06, t4, c06
	FMUL	a4, b2, t4
	LDF	[BO + 1 * SIZE], b2
	bg,pt	%icc, .LL126
	LDF	[AO + 3 * SIZE], a4	/* delay slot */

	/* Flush the pending products, then reduce c01..c08 to c01..c04 */
.LL129:
	FADD1	c03, t1, c03
	FADD3	c07, t2, c07
	FADD2	c04, t3, c04
	FADD4	c08, t4, c08

	FADD	c01, c06, c01
	FADD	c02, c05, c02
	FADD	c03, c08, c03
	FADD	c04, c07, c04

#if defined(LN) || defined(RT)
	/* Re-point AO / BO: LN steps back by 2 from KK, RT by 1 */
#ifdef LN
	sub	KK, 2, TEMP1
#else
	sub	KK, 1, TEMP1
#endif
	sll	TEMP1, 1 + ZBASE_SHIFT, TEMP2
	sll	TEMP1, 0 + ZBASE_SHIFT, TEMP1
	add	AORIG, TEMP2, AO
	add	B, TEMP1, BO
#endif

	/*
	 * Combine the accumulated sums with the four values stored at BO
	 * (LN/LT) or AO (RN/RT).
	 * NOTE(review): c = loaded - c if FSUB is rd = rs1 - rs2 - confirm.
	 */
#if defined(LN) || defined(LT)
	LDF	[BO + 0 * SIZE], a1
	LDF	[BO + 1 * SIZE], a2
	LDF	[BO + 2 * SIZE], a3
	LDF	[BO + 3 * SIZE], a4

	FSUB	a1, c01, c01
	FSUB	a2, c02, c02
	FSUB	a3, c03, c03
	FSUB	a4, c04, c04
#else
	LDF	[AO + 0 * SIZE], a1
	LDF	[AO + 1 * SIZE], a2
	LDF	[AO + 2 * SIZE], a3
	LDF	[AO + 3 * SIZE], a4

	FSUB	a1, c01, c01
	FSUB	a2, c02, c02
	FSUB	a3, c03, c03
	FSUB	a4, c04, c04
#endif

	/*
	 * LN: process the pair (c03, c04) first using AO[6..7], remove its
	 * contribution from (c01, c02) using AO[4..5], then scale (c01, c02)
	 * by AO[0..1].
	 * NOTE(review): looks like back substitution on a 2x2 triangular
	 * block of complex entries - confirm.
	 */
#ifdef LN
	LDF	[AO + 6 * SIZE], a1
	LDF	[AO + 7 * SIZE], a2
	LDF	[AO + 4 * SIZE], a3
	LDF	[AO + 5 * SIZE], a4
	LDF	[AO + 0 * SIZE], b1
	LDF	[AO + 1 * SIZE], b2

	FMUL	a1, c03, t1
	FMUL	a2, c04, t2
	FMUL	a1, c04, t3
	FMUL	a2, c03, t4

	FADD4	t1, t2, c03
	FADD2	t3, t4, c04

	FMUL	a3, c03, t1
	FMUL	a3, c04, t2
	FMUL	a4, c04, t5
	FMUL	a4, c03, t6

	FSUB	c01, t1, c01
	FSUB	c02, t2, c02
	FADD2	c01, t5, c01
	FADD4	c02, t6, c02

	FMUL	b1, c01, t1
	FMUL	b2, c02, t2
	FMUL	b1, c02, t3
	FMUL	b2, c01, t4

	FADD4	t1, t2, c01
	FADD2	t3, t4, c02
#endif

	/*
	 * LT: mirror image of the LN case - process (c01, c02) first using
	 * AO[0..1], update (c03, c04) using AO[2..3], then scale (c03, c04)
	 * by AO[6..7].
	 */
#ifdef LT
	LDF	[AO + 0 * SIZE], a1
	LDF	[AO + 1 * SIZE], a2
	LDF	[AO + 2 * SIZE], a3
	LDF	[AO + 3 * SIZE], a4
	LDF	[AO + 6 * SIZE], b1
	LDF	[AO + 7 * SIZE], b2

	FMUL	a1, c01, t1
	FMUL	a2, c02, t2
	FMUL	a1, c02, t3
	FMUL	a2, c01, t4

	FADD4	t1, t2, c01
	FADD2	t3, t4, c02

	FMUL	a3, c01, t1
	FMUL	a3, c02, t2
	FMUL	a4, c02, t5
	FMUL	a4, c01, t6

	FSUB	c03, t1, c03
	FSUB	c04, t2, c04
	FADD2	c03, t5, c03
	FADD4	c04, t6, c04

	FMUL	b1, c03, t1
	FMUL	b2, c04, t2
	FMUL	b1, c04, t3
	FMUL	b2, c03, t4

	FADD4	t1, t2, c03
	FADD2	t3, t4, c04
#endif

	/* RN / RT: both pairs (c01, c02) and (c03, c04) are multiplied by the same pair from BO[0..1] */
#ifdef RN
	LDF	[BO + 0 * SIZE], a1
	LDF	[BO + 1 * SIZE], a2

	FMUL	a1, c01, t1
	FMUL	a2, c02, t2
	FMUL	a1, c02, t3
	FMUL	a2, c01, t4

	FMUL	a1, c03, t5
	FMUL	a2, c04, t6
	FMUL	a1, c04, t7
	FMUL	a2, c03, t8

	FADD4	t1, t2, c01
	FADD3	t3, t4, c02
	FADD4	t5, t6, c03
	FADD3	t7, t8, c04
#endif

#ifdef RT
	LDF	[BO + 0 * SIZE], a1
	LDF	[BO + 1 * SIZE], a2

	FMUL	a1, c01, t1
	FMUL	a2, c02, t2
	FMUL	a1, c02, t3
	FMUL	a2, c01, t4

	FMUL	a1, c03, t5
	FMUL	a2, c04, t6
	FMUL	a1, c04, t7
	FMUL	a2, c03, t8

	FADD4	t1, t2, c01
	FADD3	t3, t4, c02
	FADD4	t5, t6, c03
	FADD3	t7, t8, c04
#endif

#ifdef LN
	add	C1, -4 * SIZE, C1	/* LN moves C1 down before storing */
#endif

	/* Write the result back to BO (LN/LT) or AO (RN/RT), and to C1 */
#if defined(LN) || defined(LT)
	STF	c01, [BO + 0 * SIZE]
	STF	c02, [BO + 1 * SIZE]
	STF	c03, [BO + 2 * SIZE]
	STF	c04, [BO + 3 * SIZE]
#else
	STF	c01, [AO + 0 * SIZE]
	STF	c02, [AO + 1 * SIZE]
	STF	c03, [AO + 2 * SIZE]
	STF	c04, [AO + 3 * SIZE]
#endif

	STF	c01, [C1 + 0 * SIZE]
	STF	c02, [C1 + 1 * SIZE]
	STF	c03, [C1 + 2 * SIZE]
	STF	c04, [C1 + 3 * SIZE]

	/* Clear the temporaries for the next pass */
	FMOV	FZERO, t1
	FMOV	FZERO, t2
	FMOV	FZERO, t3
	FMOV	FZERO, t4

#ifndef LN
	add	C1, 4 * SIZE, C1	/* every variant except LN moves C1 up after storing */
#endif

#ifdef RT
	sll	K, 1 + ZBASE_SHIFT, TEMP1
	add	AORIG, TEMP1, AORIG
#endif

#if defined(LT) || defined(RN)
	/* Advance AO by (K - KK) << (1 + ZBASE_SHIFT) and BO by (K - KK) << ZBASE_SHIFT */
	sub	K, KK, TEMP1
	sll	TEMP1, 1 + ZBASE_SHIFT, TEMP2
	sll	TEMP1, 0 + ZBASE_SHIFT, TEMP1
	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LT
	add	KK, 2, KK
#endif

#ifdef LN
	sub	KK, 2, KK
#endif

	add	I, -1, I
	cmp	I, 0
	bg,pt	%icc, .LL121
	FMOV	FZERO, c03		/* delay slot; .LL121 zeroes c03 again anyway */

	/* ---- Per-variant bookkeeping of B and KK after the M loop ---- */
.LL199:
#ifdef LN
	sll	K, 0 + ZBASE_SHIFT, TEMP1
	add	B, TEMP1, B		/* B += K << ZBASE_SHIFT */
#endif

#if defined(LT) || defined(RN)
	mov	BO, B			/* continue from where BO stopped */
#endif

#ifdef RN
	add	KK, 1, KK
#endif

#ifdef RT
	sub	KK, 1, KK
#endif

	/* Routine exit */
.LL999:
	return	%i7 + 8
	clr	%o0			/* delay slot of return: clears %o0 */

	EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -