📄 ztrsm_kernel_hummer_rt.s
字号:
FXCSMADD f4, B1, A1, f4 LFPDUX A8, AO2, INC4 FXCPMADD f8, B2, A1, f8 nop FXCSMADD f12, B2, A1, f12 LFPDUX A1, AO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX B1, BO, INC4 FXCPMADD f9, B2, A2, f9 nop FXCSMADD f13, B2, A2, f13 LFPDUX B2, BO2, INC4 FXCPMADD f0, B3, A3, f0 nop FXCSMADD f4, B3, A3, f4 LFPDUX A2, AO2, INC4 FXCPMADD f8, B4, A3, f8 nop FXCSMADD f12, B4, A3, f12 LFPDUX A3, AO, INC4 FXCPMADD f1, B3, A4, f1 nop FXCSMADD f5, B3, A4, f5 LFPDUX B3, BO, INC4 FXCPMADD f9, B4, A4, f9 nop FXCSMADD f13, B4, A4, f13 LFPDUX B4, BO2, INC4 FXCPMADD f0, B5, A5, f0 nop FXCSMADD f4, B5, A5, f4 LFPDUX A4, AO2, INC4 FXCPMADD f8, B6, A5, f8 nop FXCSMADD f12, B6, A5, f12 LFPDUX A5, AO, INC4 FXCPMADD f1, B5, A6, f1 nop FXCSMADD f5, B5, A6, f5 LFPDUX B5, BO, INC4 FXCPMADD f9, B6, A6, f9 nop FXCSMADD f13, B6, A6, f13 LFPDUX B6, BO2, INC4 FXCPMADD f0, A9, A7, f0 nop FXCSMADD f4, A9, A7, f4 LFPDUX A6, AO2, INC4 FXCPMADD f8, A10, A7, f8 nop FXCSMADD f12, A10, A7, f12 LFPDUX A7, AO, INC4 FXCPMADD f1, A9, A8, f1 nop FXCSMADD f5, A9, A8, f5 LFPDUX A9, BO, INC4 FXCPMADD f9, A10, A8, f9 nop FXCSMADD f13, A10, A8, f13 LFPDUX A10, BO2, INC4 bdnz+ .L22 .align 4.L23: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 LFPDUX A8, AO2, INC4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 FXCPMADD f0, B3, A3, f0 FXCSMADD f4, B3, A3, f4 FXCPMADD f8, B4, A3, f8 FXCSMADD f12, B4, A3, f12 FXCPMADD f1, B3, A4, f1 FXCSMADD f5, B3, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 FXCPMADD f0, B5, A5, f0 FXCSMADD f4, B5, A5, f4 FXCPMADD f8, B6, A5, f8 FXCSMADD f12, B6, A5, f12 FXCPMADD f1, B5, A6, f1 FXCSMADD f5, B5, A6, f5 FXCPMADD f9, B6, A6, f9 FXCSMADD f13, B6, A6, f13 FXCPMADD f0, A9, A7, f0 FXCSMADD f4, A9, A7, f4 FXCPMADD f8, A10, A7, f8 FXCSMADD f12, A10, A7, f12 FXCPMADD f1, A9, A8, f1 FXCSMADD f5, A9, A8, f5 FXCPMADD f9, A10, A8, f9 FXCSMADD f13, A10, A8, f13 .align 4.L24:#if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L28#else andi. r0, TEMP, 3 mtspr CTR, r0 ble+ .L28#endif LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 bdz- .L27 .align 4.L26: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 LFPDUX A1, AO, INC4 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 LFPDUX B1, BO, INC4 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 bdnz+ .L26 .align 4.L27: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 .align 4.L28: fpadd f0, f0, f4 fpadd f8, f8, f12 fpadd f1, f1, f5 fpadd f9, f9, f13#if defined(LN) || defined(RT)#ifdef LN subi r0, KK, 2#else subi r0, KK, 2#endif slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE#endif#if defined(LN) || defined(LT) LFPDUX f16, BO, INC4 LFPDUX f18, BO2, INC4 LFPDUX f17, BO, INC4 LFPDUX f19, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE#else LFPDUX f16, AO, INC4 LFPDUX f17, AO2, INC4 LFPDUX f18, AO, INC4 LFPDUX f19, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE#endif fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f8, f18, f8 fpsub f9, f19, f9#ifdef LN LFPDUX A1, AO, INC4 add AO2, AO2, INC4 LFPDUX A2, AO, INC4 LFPDUX A3, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE fxpmul f4, A3, f1 fxpmul f5, A3, f9 FXCXNPMA f1, A3, f1, f4 FXCXNPMA f9, A3, f9, f5 fxcpnmsub f0, A2, f1, f0 fxcpnmsub f8, A2, f9, f8 FXCXNSMA f0, A2, f1, f0 FXCXNSMA f8, A2, f9, f8 fxpmul f4, A1, f0 fxpmul f5, A1, f8 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f8, A1, f8, f5#endif#ifdef LT LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 add AO, AO, INC4 LFPDUX A3, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE fxpmul f4, A1, f0 fxpmul f5, A1, f8 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f8, A1, f8, f5 fxcpnmsub f1, A2, f0, f1 fxcpnmsub f9, A2, f8, f9 FXCXNSMA f1, A2, f0, f1 FXCXNSMA f9, A2, f8, f9 fxpmul f6, A3, f1 fxpmul f7, A3, f9 FXCXNPMA f1, A3, f1, f6 FXCXNPMA f9, A3, f9, f7#endif#ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 add BO, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A1, f0 fxpmul f5, A1, f1 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 fxcpnmsub f8, A2, f0, f8 fxcpnmsub f9, A2, f1, f9 FXCXNSMA f8, A2, f0, f8 FXCXNSMA f9, A2, f1, f9 fxpmul f4, A3, f8 fxpmul f5, A3, f9 FXCXNPMA f8, A3, f8, f4 FXCXNPMA f9, A3, f9, f5#endif#ifdef RT LFPDUX A1, BO, INC4 add BO2, BO2, INC4 LFPDUX A2, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A3, f8 fxpmul f5, A3, f9 FXCXNPMA f8, A3, f8, f4 FXCXNPMA f9, A3, f9, f5 fxcpnmsub f0, A2, f8, f0 fxcpnmsub f1, A2, f9, f1 FXCXNSMA f0, A2, f8, f0 FXCXNSMA f1, A2, f9, f1 fxpmul f4, A1, f0 fxpmul f5, A1, f1 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5#endif#ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE#endif#if defined(LN) || defined(LT) STFPDUX f0, BO, INC4 STFPDUX f8, BO2, INC4 STFPDUX f1, BO, INC4 STFPDUX f9, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE#else STFPDUX f0, AO, INC4 STFPDUX f1, AO2, INC4 STFPDUX f8, AO, INC4 STFPDUX f9, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE#endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f8, CO2, INC STFSDUX f8, CO2, INC STFDUX f9, CO2, INC STFSDUX f9, CO2, INC#ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE#endif#ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0#endif#if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, r0#endif#ifdef LT addi KK, KK, 2#endif#ifdef LN subi KK, KK, 2#endif li r0, FZERO lfpsx f0, SP, r0 .align 4.L30: andi. I, M, 1 beq .L49#if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 srawi. r0, KK, 2 mtspr CTR, r0 ble .L34#else#ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0#endif slwi r0 , KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, BO, - 4 * SIZE fpmr f2, f0 addi BO2, BO, 2 * SIZE fpmr f3, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L34#endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L33 .align 4.L32: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX B1, BO, INC4 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 LFPDUX B2, BO2, INC4 LFPDUX A1, AO, INC4 FXCPMADD f0, B3, A2, f0 FXCSMADD f1, B3, A2, f1 LFPDUX B3, BO, INC4 FXCPMADD f2, B4, A2, f2 FXCSMADD f3, B4, A2, f3 LFPDUX B4, BO2, INC4 LFPDUX A2, AO2, INC4 FXCPMADD f0, A5, A3, f0 FXCSMADD f1, A5, A3, f1 LFPDUX A5, BO, INC4 FXCPMADD f2, A6, A3, f2 FXCSMADD f3, A6, A3, f3 LFPDUX A6, BO2, INC4 LFPDUX A3, AO, INC4 FXCPMADD f0, A7, A4, f0 FXCSMADD f1, A7, A4, f1 LFPDUX A7, BO, INC4 FXCPMADD f2, A8, A4, f2 FXCSMADD f3, A8, A4, f3 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L32 .align 4.L33: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 FXCPMADD f0, B3, A2, f0 FXCSMADD f1, B3, A2, f1 FXCPMADD f2, B4, A2, f2 FXCSMADD f3, B4, A2, f3 FXCPMADD f0, A5, A3, f0 FXCSMADD f1, A5, A3, f1 FXCPMADD f2, A6, A3, f2 FXCSMADD f3, A6, A3, f3 FXCPMADD f0, A7, A4, f0 FXCSMADD f1, A7, A4, f1 FXCPMADD f2, A8, A4, f2 FXCSMADD f3, A8, A4, f3 .align 4.L34:#if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L38#else andi. r0, TEMP, 3 mtspr CTR, r0 ble+ .L38#endif LFPDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdz- .L37 .align 4.L36: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX B1, BO, INC4 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 LFPDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdnz+ .L36 .align 4.L37: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 .align 4.L38: fpadd f0, f0, f1 fpadd f2, f2, f3#if defined(LN) || defined(RT)#ifdef LN subi r0, KK, 1#else subi r0, KK, 2#endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 4 * SIZE#endif addi AO2, AO, 2 * SIZE addi BO2, BO, 2 * SIZE#if defined(LN) || defined(LT) LFPDX f16, BO, INC4 LFPDX f17, BO2, INC4#else LFPDX f16, AO, INC4 LFPDX f17, AO2, INC4#endif fpsub f0, f16, f0 fpsub f2, f17, f2#ifdef LN LFPDX A1, AO, INC4 fxpmul f4, A1, f0 fxpmul f5, A1, f2 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f2, A1, f2, f5#endif#ifdef LT LFPDX A1, AO, INC4 fxpmul f4, A1, f0 fxpmul f5, A1, f2 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f2, A1, f2, f5#endif#ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 add BO, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 fxcpnmsub f2, A2, f0, f2 FXCXNSMA f2, A2, f0, f2 fxpmul f4, A3, f2 FXCXNPMA f2, A3, f2, f4#endif#ifdef RT LFPDUX A1, BO, INC4 add BO2, BO2, INC4 LFPDUX A2, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A3, f2 FXCXNPMA f2, A3, f2, f4 fxcpnmsub f0, A2, f2, f0 FXCXNSMA f0, A2, f2, f0 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4#endif#ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE#endif#if defined(LN) || defined(LT) STFPDX f0, BO, INC4 STFPDX f2, BO2, INC4#else STFPDX f0, AO, INC4 STFPDX f2, AO2, INC4#endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f2, CO2, INC STFSDUX f2, CO2, INC#ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE#endif#ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0#endif#if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP#endif#ifdef LT addi KK, KK, 1#endif#ifdef LN subi KK, KK, 1#endif li r0, FZERO lfpsx f0, SP, r0 .align 4.L49:#ifdef LN slwi r0, K, 1 + ZBASE_SHIFT add B, B, r0#endif#if defined(LT) || defined(RN) addi B, BO, 4 * SIZE#endif#ifdef RN addi KK, KK, 2#endif#ifdef RT subi KK, KK, 2#endif addic. J, J, -1 bgt+ .L10 .align 4.L999: addi SP, SP, 20 lwzu r14, 4(SP) lwzu r15, 4(SP) lwzu r16, 4(SP) lwzu r17, 4(SP) lwzu r18, 4(SP) lwzu r19, 4(SP) lwzu r20, 4(SP) lwzu r21, 4(SP) lwzu r22, 4(SP) lwzu r23, 4(SP) lwzu r24, 4(SP) lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f31, SP, r0 lfpdux f30, SP, r0 lfpdux f29, SP, r0 lfpdux f28, SP, r0 lfpdux f27, SP, r0 lfpdux f26, SP, r0 lfpdux f25, SP, r0 lfpdux f24, SP, r0 lfpdux f23, SP, r0 lfpdux f22, SP, r0 lfpdux f21, SP, r0 lfpdux f20, SP, r0 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr .align 4 EPILOGUE#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -