📄 ztrsm_kernel_hummer_ln.s
字号:
/*
 * ZTRSM kernel chunk (PPC440 FP2 "Double Hummer" double FPU, GAS syntax).
 * NOTE(review): this is a mid-file fragment of ztrsm_kernel_hummer_ln.s.
 * The original extraction had collapsed all line breaks; the instruction
 * stream below is byte-for-byte the same, restored to one instruction per
 * line with preprocessor directives on their own lines.
 *
 * Context: tail of the 4x2 (M-unrolled by 4, N-unrolled by 2) TRSM block.
 * f0-f3 / f8-f11 hold the accumulated partial products for the two columns.
 * A1-A10 are triangular-matrix entries; the fxpmul/FXCXNPMA pairs perform a
 * complex multiply by the diagonal entry (presumably stored pre-inverted by
 * the pack routine -- TODO confirm against the packing code), and the
 * fxcpnmsub/FXCXNSMA pairs eliminate an off-diagonal term.
 */

/* Load the right-hand side and form the residual b - A*x in f0..f11. */
#if defined(LN) || defined(LT)
	LFPDUX	f16, BO,  INC4
	LFPDUX	f20, BO2, INC4
	LFPDUX	f17, BO,  INC4
	LFPDUX	f21, BO2, INC4
	LFPDUX	f18, BO,  INC4
	LFPDUX	f22, BO2, INC4
	LFPDUX	f19, BO,  INC4
	LFPDUX	f23, BO2, INC4

	subi	BO,  BO,  16 * SIZE
	subi	BO2, BO2, 16 * SIZE
#else
	LFPDUX	f16, AO,  INC4
	LFPDUX	f17, AO2, INC4
	LFPDUX	f18, AO,  INC4
	LFPDUX	f19, AO2, INC4
	LFPDUX	f20, AO,  INC4
	LFPDUX	f21, AO2, INC4
	LFPDUX	f22, AO,  INC4
	LFPDUX	f23, AO2, INC4

	subi	AO,  AO,  16 * SIZE
	subi	AO2, AO2, 16 * SIZE
#endif

	fpsub	f0,  f16, f0
	fpsub	f1,  f17, f1
	fpsub	f2,  f18, f2
	fpsub	f3,  f19, f3
	fpsub	f8,  f20, f8
	fpsub	f9,  f21, f9
	fpsub	f10, f22, f10
	fpsub	f11, f23, f11

#ifdef LN
	/* LN: 4x4 lower-triangular solve, backward (row 3 up to row 0).
	   The interleaved add's skip the strictly-upper (unused) entries. */
	LFPDUX	A1, AO,  INC4
	add	AO2, AO2, INC4
	add	AO,  AO,  INC4
	add	AO2, AO2, INC4
	LFPDUX	A2, AO,  INC4
	LFPDUX	A3, AO2, INC4
	add	AO,  AO,  INC4
	add	AO2, AO2, INC4
	LFPDUX	A4, AO,  INC4
	LFPDUX	A5, AO2, INC4
	LFPDUX	A6, AO,  INC4
	add	AO2, AO2, INC4
	LFPDUX	A7, AO,  INC4
	LFPDUX	A8, AO2, INC4
	LFPDUX	A9, AO,  INC4
	LFPDUX	A10, AO2, INC4

	subi	AO,  AO,  32 * SIZE
	subi	AO2, AO2, 32 * SIZE

	/* x3 = b3 * inv(a33) (complex multiply, both columns) */
	fxpmul	f4, A10, f3
	fxpmul	f5, A10, f11
	FXCXNPMA	f3,  A10, f3,  f4
	FXCXNPMA	f11, A10, f11, f5

	/* eliminate x3 from rows 2, 1, 0 */
	fxcpnmsub	f2,  A9, f3,  f2
	fxcpnmsub	f10, A9, f11, f10
	FXCXNSMA	f2,  A9, f3,  f2
	FXCXNSMA	f10, A9, f11, f10

	fxcpnmsub	f1, A8, f3,  f1
	fxcpnmsub	f9, A8, f11, f9
	FXCXNSMA	f1, A8, f3,  f1
	FXCXNSMA	f9, A8, f11, f9

	fxcpnmsub	f0, A7, f3,  f0
	fxcpnmsub	f8, A7, f11, f8
	FXCXNSMA	f0, A7, f3,  f0
	FXCXNSMA	f8, A7, f11, f8

	/* x2 = b2 * inv(a22), then eliminate from rows 1, 0 */
	fxpmul	f4, A6, f2
	fxpmul	f5, A6, f10
	FXCXNPMA	f2,  A6, f2,  f4
	FXCXNPMA	f10, A6, f10, f5

	fxcpnmsub	f1, A5, f2,  f1
	fxcpnmsub	f9, A5, f10, f9
	FXCXNSMA	f1, A5, f2,  f1
	FXCXNSMA	f9, A5, f10, f9

	fxcpnmsub	f0, A4, f2,  f0
	fxcpnmsub	f8, A4, f10, f8
	FXCXNSMA	f0, A4, f2,  f0
	FXCXNSMA	f8, A4, f10, f8

	/* x1 = b1 * inv(a11), eliminate from row 0 */
	fxpmul	f4, A3, f1
	fxpmul	f5, A3, f9
	FXCXNPMA	f1, A3, f1, f4
	FXCXNPMA	f9, A3, f9, f5

	fxcpnmsub	f0, A2, f1, f0
	fxcpnmsub	f8, A2, f9, f8
	FXCXNSMA	f0, A2, f1, f0
	FXCXNSMA	f8, A2, f9, f8

	/* x0 = b0 * inv(a00) */
	fxpmul	f4, A1, f0
	fxpmul	f5, A1, f8
	FXCXNPMA	f0, A1, f0, f4
	FXCXNPMA	f8, A1, f8, f5
#endif

#ifdef LT
	/* LT: 4x4 upper-triangular solve, forward (row 0 down to row 3). */
	LFPDUX	A1, AO,  INC4
	LFPDUX	A2, AO2, INC4
	LFPDUX	A3, AO,  INC4
	LFPDUX	A4, AO2, INC4
	add	AO,  AO,  INC4
	LFPDUX	A5, AO2, INC4
	LFPDUX	A6, AO,  INC4
	LFPDUX	A7, AO2, INC4
	add	AO,  AO,  INC4
	add	AO2, AO2, INC4
	LFPDUX	A8, AO,  INC4
	LFPDUX	A9, AO2, INC4
	add	AO,  AO,  INC4
	add	AO2, AO2, INC4
	add	AO,  AO,  INC4
	LFPDUX	A10, AO2, INC4

	subi	AO,  AO,  32 * SIZE
	subi	AO2, AO2, 32 * SIZE

	/* x0 = b0 * inv(a00), then eliminate from rows 1..3 */
	fxpmul	f4, A1, f0
	fxpmul	f5, A1, f8
	FXCXNPMA	f0, A1, f0, f4
	FXCXNPMA	f8, A1, f8, f5

	fxcpnmsub	f1, A2, f0, f1
	fxcpnmsub	f9, A2, f8, f9
	FXCXNSMA	f1, A2, f0, f1
	FXCXNSMA	f9, A2, f8, f9

	fxcpnmsub	f2,  A3, f0, f2
	fxcpnmsub	f10, A3, f8, f10
	FXCXNSMA	f2,  A3, f0, f2
	FXCXNSMA	f10, A3, f8, f10

	fxcpnmsub	f3,  A4, f0, f3
	fxcpnmsub	f11, A4, f8, f11
	FXCXNSMA	f3,  A4, f0, f3
	FXCXNSMA	f11, A4, f8, f11

	/* x1, eliminate from rows 2..3 */
	fxpmul	f6, A5, f1
	fxpmul	f7, A5, f9
	FXCXNPMA	f1, A5, f1, f6
	FXCXNPMA	f9, A5, f9, f7

	fxcpnmsub	f2,  A6, f1, f2
	fxcpnmsub	f10, A6, f9, f10
	FXCXNSMA	f2,  A6, f1, f2
	FXCXNSMA	f10, A6, f9, f10

	fxcpnmsub	f3,  A7, f1, f3
	fxcpnmsub	f11, A7, f9, f11
	FXCXNSMA	f3,  A7, f1, f3
	FXCXNSMA	f11, A7, f9, f11

	/* x2, eliminate from row 3 */
	fxpmul	f4, A8, f2
	fxpmul	f5, A8, f10
	FXCXNPMA	f2,  A8, f2,  f4
	FXCXNPMA	f10, A8, f10, f5

	fxcpnmsub	f3,  A9, f2,  f3
	fxcpnmsub	f11, A9, f10, f11
	FXCXNSMA	f3,  A9, f2,  f3
	FXCXNSMA	f11, A9, f10, f11

	/* x3 */
	fxpmul	f6, A10, f3
	fxpmul	f7, A10, f11
	FXCXNPMA	f3,  A10, f3,  f6
	FXCXNPMA	f11, A10, f11, f7
#endif

#ifdef RN
	/* RN: 2x2 solve from the B (right-hand) triangle, forward. */
	LFPDUX	A1, BO,  INC4
	LFPDUX	A2, BO2, INC4
	add	BO, BO, INC4
	LFPDUX	A3, BO2, INC4

	subi	BO,  BO,  8 * SIZE
	subi	BO2, BO2, 8 * SIZE

	fxpmul	f4, A1, f0
	fxpmul	f5, A1, f1
	fxpmul	f6, A1, f2
	fxpmul	f7, A1, f3
	FXCXNPMA	f0, A1, f0, f4
	FXCXNPMA	f1, A1, f1, f5
	FXCXNPMA	f2, A1, f2, f6
	FXCXNPMA	f3, A1, f3, f7

	fxcpnmsub	f8,  A2, f0, f8
	fxcpnmsub	f9,  A2, f1, f9
	fxcpnmsub	f10, A2, f2, f10
	fxcpnmsub	f11, A2, f3, f11
	FXCXNSMA	f8,  A2, f0, f8
	FXCXNSMA	f9,  A2, f1, f9
	FXCXNSMA	f10, A2, f2, f10
	FXCXNSMA	f11, A2, f3, f11

	fxpmul	f4, A3, f8
	fxpmul	f5, A3, f9
	fxpmul	f6, A3, f10
	fxpmul	f7, A3, f11
	FXCXNPMA	f8,  A3, f8,  f4
	FXCXNPMA	f9,  A3, f9,  f5
	FXCXNPMA	f10, A3, f10, f6
	FXCXNPMA	f11, A3, f11, f7
#endif

#ifdef RT
	/* RT: 2x2 solve from the B triangle, backward (column 1 first). */
	LFPDUX	A1, BO, INC4
	add	BO2, BO2, INC4
	LFPDUX	A2, BO,  INC4
	LFPDUX	A3, BO2, INC4

	subi	BO,  BO,  8 * SIZE
	subi	BO2, BO2, 8 * SIZE

	fxpmul	f4, A3, f8
	fxpmul	f5, A3, f9
	fxpmul	f6, A3, f10
	fxpmul	f7, A3, f11
	FXCXNPMA	f8,  A3, f8,  f4
	FXCXNPMA	f9,  A3, f9,  f5
	FXCXNPMA	f10, A3, f10, f6
	FXCXNPMA	f11, A3, f11, f7

	fxcpnmsub	f0, A2, f8,  f0
	fxcpnmsub	f1, A2, f9,  f1
	fxcpnmsub	f2, A2, f10, f2
	fxcpnmsub	f3, A2, f11, f3
	FXCXNSMA	f0, A2, f8,  f0
	FXCXNSMA	f1, A2, f9,  f1
	FXCXNSMA	f2, A2, f10, f2
	FXCXNSMA	f3, A2, f11, f3

	fxpmul	f4, A1, f0
	fxpmul	f5, A1, f1
	fxpmul	f6, A1, f2
	fxpmul	f7, A1, f3
	FXCXNPMA	f0, A1, f0, f4
	FXCXNPMA	f1, A1, f1, f5
	FXCXNPMA	f2, A1, f2, f6
	FXCXNPMA	f3, A1, f3, f7
#endif

/* Write the solved block back to the packed buffer and to C.
   For LN the C pointers walk backward, hence the pre/post subi pairs. */
#ifdef LN
	subi	CO1, CO1, 8 * SIZE
	subi	CO2, CO2, 8 * SIZE
#endif

#if defined(LN) || defined(LT)
	STFPDUX	f0, BO,  INC4
	STFPDUX	f8, BO2, INC4
	STFPDUX	f1, BO,  INC4
	STFPDUX	f9, BO2, INC4
	STFPDUX	f2, BO,  INC4
	STFPDUX	f10, BO2, INC4
	STFPDUX	f3, BO,  INC4
	STFPDUX	f11, BO2, INC4

	subi	BO,  BO,  16 * SIZE
	subi	BO2, BO2, 16 * SIZE
#else
	STFPDUX	f0, AO,  INC4
	STFPDUX	f1, AO2, INC4
	STFPDUX	f2, AO,  INC4
	STFPDUX	f3, AO2, INC4
	STFPDUX	f8, AO,  INC4
	STFPDUX	f9, AO2, INC4
	STFPDUX	f10, AO, INC4
	STFPDUX	f11, AO2, INC4

	subi	AO,  AO,  16 * SIZE
	subi	AO2, AO2, 16 * SIZE
#endif

	/* STFDUX stores the primary double, STFSDUX the secondary:
	   together one complex element per pair. */
	STFDUX	f0, CO1, INC
	STFSDUX	f0, CO1, INC
	STFDUX	f1, CO1, INC
	STFSDUX	f1, CO1, INC
	STFDUX	f2, CO1, INC
	STFSDUX	f2, CO1, INC
	STFDUX	f3, CO1, INC
	STFSDUX	f3, CO1, INC

	STFDUX	f8, CO2, INC
	STFSDUX	f8, CO2, INC
	STFDUX	f9, CO2, INC
	STFSDUX	f9, CO2, INC
	STFDUX	f10, CO2, INC
	STFSDUX	f10, CO2, INC
	STFDUX	f11, CO2, INC
	STFSDUX	f11, CO2, INC

#ifdef LN
	subi	CO1, CO1, 8 * SIZE
	subi	CO2, CO2, 8 * SIZE
#endif

/* Advance AO/BO/AORIG for the next 4-wide M iteration. */
#ifdef RT
	slwi	r0, K, 2 + ZBASE_SHIFT
	add	AORIG, AORIG, r0
#endif

#if defined(LT) || defined(RN)
	sub	TEMP, K, KK
	slwi	r0,   TEMP, 2 + ZBASE_SHIFT
	slwi	TEMP, TEMP, 1 + ZBASE_SHIFT
	add	AO, AO, r0
	add	BO, BO, TEMP
#endif

#ifdef LT
	addi	KK, KK, 4
#endif

#ifdef LN
	subi	KK, KK, 4
#endif

	addic.	I, I, -1
	li	r0, FZERO
	lfpsx	f0, SP, r0
	bgt+	.L11
	.align 4

.L49:
	/* End of the N-unrolled-by-2 column pair: advance B, KK, J. */
#ifdef LN
	slwi	r0, K, 1 + ZBASE_SHIFT
	add	B, B, r0
#endif

#if defined(LT) || defined(RN)
	addi	B, BO, 4 * SIZE
#endif

#ifdef RN
	addi	KK, KK, 2
#endif

#ifdef RT
	subi	KK, KK, 2
#endif

	addic.	J, J, -1
	bgt+	.L10
	.align 4

.L50:
	/* Remaining single column (N & 1). */
	andi.	J, N, 1
	beq	.L999

#ifdef RT
	slwi	r0, K, 0 + ZBASE_SHIFT
	sub	B, B, r0
	sub	C, C, LDC
#endif

	mr	CO1, C

#ifdef LN
	add	KK, M, OFFSET
#endif

#ifdef LT
	mr	KK, OFFSET
#endif

#if defined(LN) || defined(RT)
	addi	AORIG, A, -2 * SIZE
#else
	addi	AO, A, -2 * SIZE
#endif
#ifndef RT
	add	C, CO2, LDC	/* NOTE(review): uses CO2 although only CO1 was
				   set in this path -- verify against the full file */
#endif

	li	r0, FZERO
	lfpsx	f0, SP, r0

	/* M & 1: 1x1 sub-block, K unrolled by 8. */
	andi.	I, M, 1
	beq	.L60

#if defined(LT) || defined(RN)
	addi	BO, B, - 2 * SIZE

	fpmr	f1, f0
	fpmr	f2, f0
	fpmr	f3, f0

	srawi.	r0, KK, 3
	mtspr	CTR, r0
	ble	.L74
#else
#ifdef LN
	slwi	r0, K, 0 + ZBASE_SHIFT
	sub	AORIG, AORIG, r0
#endif

	slwi	TEMP, KK, 0 + ZBASE_SHIFT
	add	AO, AORIG, TEMP
	add	BO, B,     TEMP

	sub	TEMP, K, KK

	addi	BO, BO, - 2 * SIZE

	fpmr	f1, f0
	fpmr	f2, f0
	fpmr	f3, f0

	srawi.	r0, TEMP, 3
	mtspr	CTR, r0
	ble	.L74
#endif

	/* Preload 8 A/B element pairs (A9/A10 double as extra B registers). */
	LFPDUX	A1, AO, INC2
	LFPDUX	B1, BO, INC2
	LFPDUX	A2, AO, INC2
	LFPDUX	B2, BO, INC2
	LFPDUX	A3, AO, INC2
	LFPDUX	B3, BO, INC2
	LFPDUX	A4, AO, INC2
	LFPDUX	B4, BO, INC2

	LFPDUX	A5, AO, INC2
	LFPDUX	B5, BO, INC2
	LFPDUX	A6, AO, INC2
	LFPDUX	B6, BO, INC2
	LFPDUX	A7, AO, INC2
	LFPDUX	A9, BO, INC2
	LFPDUX	A8, AO, INC2
	LFPDUX	A10, BO, INC2
	bdz-	.L73
	.align 4

.L72:
	/* Software-pipelined 8x-unrolled complex dot product. */
	FXCPMADD	f0, B1, A1, f0
	FXCSMADD	f1, B1, A1, f1
	LFPDUX	A1, AO, INC2
	LFPDUX	B1, BO, INC2

	FXCPMADD	f2, B2, A2, f2
	FXCSMADD	f3, B2, A2, f3
	LFPDUX	A2, AO, INC2
	LFPDUX	B2, BO, INC2

	FXCPMADD	f0, B3, A3, f0
	FXCSMADD	f1, B3, A3, f1
	LFPDUX	A3, AO, INC2
	LFPDUX	B3, BO, INC2

	FXCPMADD	f2, B4, A4, f2
	FXCSMADD	f3, B4, A4, f3
	LFPDUX	A4, AO, INC2
	LFPDUX	B4, BO, INC2

	FXCPMADD	f0, B5, A5, f0
	FXCSMADD	f1, B5, A5, f1
	LFPDUX	A5, AO, INC2
	LFPDUX	B5, BO, INC2

	FXCPMADD	f2, B6, A6, f2
	FXCSMADD	f3, B6, A6, f3
	LFPDUX	A6, AO, INC2
	LFPDUX	B6, BO, INC2

	FXCPMADD	f0, A9, A7, f0
	FXCSMADD	f1, A9, A7, f1
	LFPDUX	A7, AO, INC2
	LFPDUX	A9, BO, INC2

	FXCPMADD	f2, A10, A8, f2
	FXCSMADD	f3, A10, A8, f3
	LFPDUX	A8, AO, INC2
	LFPDUX	A10, BO, INC2
	bdnz+	.L72
	.align 4

.L73:
	/* Drain the pipeline: same 8 FMAs without reloading. */
	FXCPMADD	f0, B1, A1, f0
	FXCSMADD	f1, B1, A1, f1
	FXCPMADD	f2, B2, A2, f2
	FXCSMADD	f3, B2, A2, f3

	FXCPMADD	f0, B3, A3, f0
	FXCSMADD	f1, B3, A3, f1
	FXCPMADD	f2, B4, A4, f2
	FXCSMADD	f3, B4, A4, f3

	FXCPMADD	f0, B5, A5, f0
	FXCSMADD	f1, B5, A5, f1
	FXCPMADD	f2, B6, A6, f2
	FXCSMADD	f3, B6, A6, f3

	FXCPMADD	f0, A9, A7, f0
	FXCSMADD	f1, A9, A7, f1
	FXCPMADD	f2, A10, A8, f2
	FXCSMADD	f3, A10, A8, f3
	.align 4

.L74:
	/* Remainder loop: K mod 8 iterations, one FMA pair each. */
#if defined(LT) || defined(RN)
	andi.	r0, KK, 7
	mtspr	CTR, r0
	ble+	.L78
#else
	andi.	r0, TEMP, 7
	mtspr	CTR, r0
	ble+	.L78
#endif

	LFPDUX	A1, AO, INC2
	LFPDUX	B1, BO, INC2
	bdz-	.L77
	.align 4

.L76:
	FXCPMADD	f0, B1, A1, f0
	FXCSMADD	f1, B1, A1, f1
	LFPDUX	A1, AO, INC2
	LFPDUX	B1, BO, INC2
	bdnz+	.L76
	.align 4

.L77:
	FXCPMADD	f0, B1, A1, f0
	FXCSMADD	f1, B1, A1, f1
	.align 4

.L78:
	/* Fold the four partial accumulators into f0. */
	fpadd	f0, f0, f2
	fpadd	f1, f1, f3
	fpadd	f0, f0, f1

#if defined(LN) || defined(RT)
#ifdef LN
	subi	r0, KK, 1
#else
	subi	r0, KK, 1
#endif
	slwi	TEMP, r0, 0 + ZBASE_SHIFT
	add	AO, AORIG, TEMP
	add	BO, B,     TEMP
	addi	BO, BO, - 2 * SIZE
#endif

#if defined(LN) || defined(LT)
	LFPDX	f16, BO, INC2
#else
	LFPDX	f16, AO, INC2
#endif

	fpsub	f0, f16, f0

	/* 1x1 solve: one complex multiply by the (pre-inverted) diagonal. */
#ifdef LN
	LFPDX	A1, AO, INC2
	fxpmul	f4, A1, f0
	FXCXNPMA	f0, A1, f0, f4
#endif

#ifdef LT
	LFPDX	A1, AO, INC2
	fxpmul	f4, A1, f0
	FXCXNPMA	f0, A1, f0, f4
#endif

#ifdef RN
	LFPDX	A1, BO, INC2
	fxpmul	f4, A1, f0
	FXCXNPMA	f0, A1, f0, f4
#endif

#ifdef RT
	LFPDX	A1, BO, INC2
	fxpmul	f4, A1, f0
	FXCXNPMA	f0, A1, f0, f4
#endif

#ifdef LN
	subi	CO1, CO1, 2 * SIZE
#endif

#if defined(LN) || defined(LT)
	STFPDX	f0, BO, INC2
#else
	STFPDX	f0, AO, INC2
#endif

	STFDUX	f0, CO1, INC
	STFSDUX	f0, CO1, INC

#ifdef LN
	subi	CO1, CO1, 2 * SIZE
#endif

#ifdef RT
	slwi	r0, K, 0 + ZBASE_SHIFT
	add	AORIG, AORIG, r0
#endif

#if defined(LT) || defined(RN)
	sub	TEMP, K, KK
	slwi	TEMP, TEMP, 0 + ZBASE_SHIFT
	add	AO, AO, TEMP
	add	BO, BO, TEMP
#endif

#ifdef LT
	addi	KK, KK, 1
#endif

#ifdef LN
	subi	KK, KK, 1
#endif

	li	r0, FZERO
	lfpsx	f0, SP, r0
	.align 4

.L60:
	/* M & 2: 2x1 sub-block, K unrolled by 4. */
	andi.	I, M, 2
	beq	.L70

#if defined(LT) || defined(RN)
	fpmr	f1, f0
	addi	BO, B, - 2 * SIZE
	fpmr	f2, f0
	fpmr	f3, f0

	srawi.	r0, KK, 2
	mtspr	CTR, r0
	ble	.L64
#else
#ifdef LN
	slwi	r0, K, 1 + ZBASE_SHIFT
	sub	AORIG, AORIG, r0
#endif

	slwi	r0,   KK, 1 + ZBASE_SHIFT
	slwi	TEMP, KK, 0 + ZBASE_SHIFT
	add	AO, AORIG, r0
	add	BO, B,     TEMP

	sub	TEMP, K, KK

	fpmr	f1, f0
	addi	BO, BO, - 2 * SIZE
	fpmr	f2, f0
	fpmr	f3, f0

	srawi.	r0, TEMP, 2
	mtspr	CTR, r0
	ble	.L64
#endif

	LFPDUX	B1, BO, INC2
	LFPDUX	A1, AO, INC2
	LFPDUX	A2, AO, INC2
	LFPDUX	B2, BO, INC2
	LFPDUX	A3, AO, INC2
	LFPDUX	A4, AO, INC2

	LFPDUX	B3, BO, INC2
	LFPDUX	A5, AO, INC2
	LFPDUX	A6, AO, INC2
	LFPDUX	B4, BO, INC2
	LFPDUX	A7, AO, INC2
	LFPDUX	A8, AO, INC2
	bdz-	.L63
	.align 4

.L62:
	FXCPMADD	f0, B1, A1, f0
	FXCSMADD	f2, B1, A1, f2
	LFPDUX	A1, AO, INC2
	FXCPMADD	f1, B1, A2, f1
	FXCSMADD	f3, B1, A2, f3
	LFPDUX	A2, AO, INC2
	LFPDUX	B1, BO, INC2

	FXCPMADD	f0, B2, A3, f0
	FXCSMADD	f2, B2, A3, f2
	LFPDUX	A3, AO, INC2
	FXCPMADD	f1, B2, A4, f1
	FXCSMADD	f3, B2, A4, f3
	LFPDUX	A4, AO, INC2
	LFPDUX	B2, BO, INC2

	FXCPMADD	f0, B3, A5, f0
	FXCSMADD	f2, B3, A5, f2
	LFPDUX	A5, AO, INC2
	FXCPMADD	f1, B3, A6, f1
	FXCSMADD	f3, B3, A6, f3
	LFPDUX	A6, AO, INC2
	LFPDUX	B3, BO, INC2

	FXCPMADD	f0, B4, A7, f0
	FXCSMADD	f2, B4, A7, f2
	LFPDUX	A7, AO, INC2
	FXCPMADD	f1, B4, A8, f1
	FXCSMADD	f3, B4, A8, f3
	LFPDUX	A8, AO, INC2
	LFPDUX	B4, BO, INC2
	bdnz+	.L62
	.align 4

.L63:
	FXCPMADD	f0, B1, A1, f0
	FXCSMADD	f2, B1, A1, f2
	FXCPMADD	f1, B1, A2, f1
	FXCSMADD	f3, B1, A2, f3

	FXCPMADD	f0, B2, A3, f0
	FXCSMADD	f2, B2, A3, f2
	FXCPMADD	f1, B2, A4, f1
	FXCSMADD	f3, B2, A4, f3

	FXCPMADD	f0, B3, A5, f0
	/* NOTE(review): source chunk is truncated here, mid-.L63;
	   the remainder of this drain block lies outside this view. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -