📄 gemm_kernel_hummer.s
字号:
LFPDUX A2, CO3, INCM3 fxcpmadd f2, AP, f2, B1 LFPDUX A4, CO3, INC4 fxcpmadd f3, AP, f3, A8 LFPDUX A10, CO4, INCM3 fxcpmadd f4, AP, f4, A3 LFPDUX A1, CO4, INC4 fxcpmadd f5, AP, f5, A9 STFPDUX f0, CO1, INCM7 fxcpmadd f6, AP, f6, A5 STFPDUX f1, CO1, INC2 fxcpmadd f7, AP, f7, B4 STFPDUX f2, CO1, INC2 fxcpmadd f8, AP, f8, B3 STFPDUX f3, CO1, INC2 fxcpmadd f9, AP, f9, A2 STFPDUX f4, CO2, INCM7 fxcpmadd f10, AP, f10, A6 STFPDUX f5, CO2, INC2 fxcpmadd f11, AP, f11, A4 STFPDUX f6, CO2, INC2 fxcpmadd f12, AP, f12, A7 STFPDUX f7, CO2, INC2 fxcpmadd f13, AP, f13, A10 STFPDUX f8, CO3, INCM7 fxcpmadd f14, AP, f14, B2 STFPDUX f9, CO3, INC2 fxcpmadd f15, AP, f15, A1 STFPDUX f10, CO3, INC2 STFPDUX f11, CO3, INC2 STFPDUX f12, CO4, INCM7 STFPDUX f13, CO4, INC2 STFPDUX f14, CO4, INC2 STFPDUX f15, CO4, INC2#else fpmul f0, AP, f0 fpmul f1, AP, f1 fpmul f2, AP, f2 fpmul f3, AP, f3 fpmul f4, AP, f4 fpmul f5, AP, f5 STFPDUX f0, CO1, INC2 fpmul f6, AP, f6 STFPDUX f1, CO1, INC2 fpmul f7, AP, f7 STFPDUX f2, CO1, INC2 fpmul f8, AP, f8 STFPDUX f3, CO1, INC2 fpmul f9, AP, f9 STFPDUX f4, CO2, INC2 fpmul f10, AP, f10 STFPDUX f5, CO2, INC2 fpmul f11, AP, f11 STFPDUX f6, CO2, INC2 fpmul f12, AP, f12 STFPDUX f7, CO2, INC2 fpmul f13, AP, f13 STFPDUX f8, CO3, INC2 fpmul f14, AP, f14 STFPDUX f9, CO3, INC2 fpmul f15, AP, f15 STFPDUX f10, CO3, INC2 STFPDUX f11, CO3, INC2 STFPDUX f12, CO4, INC2 STFPDUX f13, CO4, INC2 STFPDUX f14, CO4, INC2 STFPDUX f15, CO4, INC2#endif#ifdef TRMMKERNEL#if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK#ifdef LEFT addi TEMP, TEMP, -8#else addi TEMP, TEMP, -4#endif slwi r0, TEMP, 3 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP#endif#ifdef LEFT addi KK, KK, 8#endif#endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L11 .align 4.L20: andi. I, M, 4 beq .L30#if defined(TRMMKERNEL)#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0#else slwi TEMP, KK, 2 + BASE_SHIFT slwi r0, KK, 2 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 4#else addi TEMP, KK, 4#endif srawi. TEMP, TEMP, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, TEMP fpmr f13, f0 ble .L24#else addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, K, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L24#endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX B3, BO, INC4 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 LFPDUX A5, AO, INC4 LFPDUX B5, BO, INC4 LFPDUX A6, AO2, INC4 LFPDUX B6, BO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A9, BO, INC4 LFPDUX A10, BO2, INC4 bdz- .L23 .align 4.L22: fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 LFPDUX A8, AO2, INC4 fxcpmadd f8, B2, A1, f8 nop fxcsmadd f12, B2, A1, f12 LFPDUX A1, AO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX B1, BO, INC4 fxcpmadd f9, B2, A2, f9 nop fxcsmadd f13, B2, A2, f13 LFPDUX B2, BO2, INC4 fxcpmadd f0, B3, A3, f0 nop fxcsmadd f4, B3, A3, f4 LFPDUX A2, AO2, INC4 fxcpmadd f8, B4, A3, f8 nop fxcsmadd f12, B4, A3, f12 LFPDUX A3, AO, INC4 fxcpmadd f1, B3, A4, f1 nop fxcsmadd f5, B3, A4, f5 LFPDUX B3, BO, INC4 fxcpmadd f9, B4, A4, f9 nop fxcsmadd f13, B4, A4, f13 LFPDUX B4, BO2, INC4 fxcpmadd f0, B5, A5, f0 nop fxcsmadd f4, B5, A5, f4 LFPDUX A4, AO2, INC4 fxcpmadd f8, B6, A5, f8 nop fxcsmadd f12, B6, A5, f12 LFPDUX A5, AO, INC4 fxcpmadd f1, B5, A6, f1 nop fxcsmadd f5, B5, A6, f5 LFPDUX B5, BO, INC4 fxcpmadd f9, B6, A6, f9 nop fxcsmadd f13, B6, A6, f13 LFPDUX B6, BO2, INC4 fxcpmadd f0, A9, A7, f0 nop fxcsmadd f4, A9, A7, f4 LFPDUX A6, AO2, INC4 fxcpmadd f8, A10, A7, f8 nop fxcsmadd f12, A10, A7, f12 LFPDUX A7, AO, INC4 fxcpmadd f1, A9, A8, f1 nop fxcsmadd f5, A9, A8, f5 LFPDUX A9, BO, INC4 fxcpmadd f9, A10, A8, f9 nop fxcsmadd f13, A10, A8, f13 LFPDUX A10, BO2, INC4 bdnz+ .L22 .align 4.L23: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 LFPDUX A8, AO2, INC4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 fxcpmadd f0, B3, A3, f0 fxcsmadd f4, B3, A3, f4 fxcpmadd f8, B4, A3, f8 fxcsmadd f12, B4, A3, f12 fxcpmadd f1, B3, A4, f1 fxcsmadd f5, B3, A4, f5 fxcpmadd f9, B4, A4, f9 fxcsmadd f13, B4, A4, f13 fxcpmadd f0, B5, A5, f0 fxcsmadd f4, B5, A5, f4 fxcpmadd f8, B6, A5, f8 fxcsmadd f12, B6, A5, f12 fxcpmadd f1, B5, A6, f1 fxcsmadd f5, B5, A6, f5 fxcpmadd f9, B6, A6, f9 fxcsmadd f13, B6, A6, f13 fxcpmadd f0, A9, A7, f0 fxcsmadd f4, A9, A7, f4 fxcpmadd f8, A10, A7, f8 fxcsmadd f12, A10, A7, f12 fxcpmadd f1, A9, A8, f1 fxcsmadd f5, A9, A8, f5 fxcpmadd f9, A10, A8, f9 fxcsmadd f13, A10, A8, f13 .align 4.L24: lfd AP, ALPHA(SP)#ifdef TRMMKERNEL fsmfp AP, AP#endif#if defined(TRMMKERNEL)#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 4#else addi TEMP, KK, 4#endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP#else andi. r0, K, 3 mtspr CTR, r0#endif ble+ .L28 LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 bdz- .L27 .align 4.L26: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 LFPDUX A1, AO, INC4 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 LFPDUX B1, BO, INC4 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 bdnz+ .L26 .align 4.L27: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 .align 4.L28:#ifndef TRMMKERNEL LFPDUX A1, CO1, INC2 LFPDUX B1, CO1, INC2 LFPDUX B3, CO2, INC2 LFPDUX A6, CO2, INC2 LFPDUX B5, CO3, INC2 LFPDUX A8, CO3, INC2 LFPDUX A2, CO4, INC2 LFPDUX A4, CO4, INC2 fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, B1 fxcpmadd f4, AP, f4, B3 fxcpmadd f5, AP, f5, A6 fxcpmadd f8, AP, f8, B5 fxcpmadd f9, AP, f9, A8 STFPDUX f0, CO1, INCM3 fxcpmadd f12, AP, f12, A2 STFPDUX f1, CO1, INC2 fxcpmadd f13, AP, f13, A4 STFPDUX f4, CO2, INCM3 STFPDUX f5, CO2, INC2 STFPDUX f8, CO3, INCM3 STFPDUX f9, CO3, INC2 STFPDUX f12, CO4, INCM3 STFPDUX f13, CO4, INC2#else fpmul f0, AP, f0 fpmul f1, AP, f1 fpmul f4, AP, f4 fpmul f5, AP, f5 fpmul f8, AP, f8 fpmul f9, AP, f9 STFPDUX f0, CO1, INC2 fpmul f12, AP, f12 STFPDUX f1, CO1, INC2 fpmul f13, AP, f13 STFPDUX f4, CO2, INC2 STFPDUX f5, CO2, INC2 STFPDUX f8, CO3, INC2 STFPDUX f9, CO3, INC2 STFPDUX f12, CO4, INC2 STFPDUX f13, CO4, INC2#endif#ifdef TRMMKERNEL#if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK#ifdef LEFT addi TEMP, TEMP, -4#else addi TEMP, TEMP, -4#endif slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP#endif#ifdef LEFT addi KK, KK, 4#endif#endif li r0, FZERO lfpsx f0, SP, r0 .align 4.L30: andi. I, M, 2 beq .L40#if defined(TRMMKERNEL)#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0#else slwi TEMP, KK, 1 + BASE_SHIFT slwi r0, KK, 2 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, BO, - 4 * SIZE fpmr f2, f0 addi BO2, BO, 2 * SIZE fpmr f3, f0#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 2#else addi TEMP, KK, 4#endif srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L34#else addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 srawi. r0, K, 2 mtspr CTR, r0 ble .L34#endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L33 .align 4.L32: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 LFPDUX B1, BO, INC4 fxcpmadd f2, B2, A1, f2 fxcsmadd f3, B2, A1, f3 LFPDUX B2, BO2, INC4 LFPDUX A1, AO, INC4 fxcpmadd f0, B3, A2, f0 fxcsmadd f1, B3, A2, f1 LFPDUX B3, BO, INC4 fxcpmadd f2, B4, A2, f2 fxcsmadd f3, B4, A2, f3 LFPDUX B4, BO2, INC4 LFPDUX A2, AO2, INC4 fxcpmadd f0, A5, A3, f0 fxcsmadd f1, A5, A3, f1 LFPDUX A5, BO, INC4 fxcpmadd f2, A6, A3, f2 fxcsmadd f3, A6, A3, f3 LFPDUX A6, BO2, INC4 LFPDUX A3, AO, INC4 fxcpmadd f0, A7, A4, f0 fxcsmadd f1, A7, A4, f1 LFPDUX A7, BO, INC4 fxcpmadd f2, A8, A4, f2 fxcsmadd f3, A8, A4, f3 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L32 .align 4.L33: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 fxcpmadd f2, B2, A1, f2 fxcsmadd f3, B2, A1, f3 fxcpmadd f0, B3, A2, f0 fxcsmadd f1, B3, A2, f1 fxcpmadd f2, B4, A2, f2 fxcsmadd f3, B4, A2, f3 fxcpmadd f0, A5, A3, f0 fxcsmadd f1, A5, A3, f1 fxcpmadd f2, A6, A3, f2 fxcsmadd f3, A6, A3, f3 fxcpmadd f0, A7, A4, f0 fxcsmadd f1, A7, A4, f1 fxcpmadd f2, A8, A4, f2 fxcsmadd f3, A8, A4, f3 .align 4.L34: lfd AP, ALPHA(SP)#ifdef TRMMKERNEL fsmfp AP, AP#endif#if defined(TRMMKERNEL)#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 2#else addi TEMP, KK, 4#endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP#else andi. r0, K, 3 mtspr CTR, r0#endif ble+ .L38 LFPDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdz- .L37 .align 4.L36: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 LFPDUX B1, BO, INC4 fxcpmadd f2, B2, A1, f2 fxcsmadd f3, B2, A1, f3 LFPDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdnz+ .L36 .align 4.L37: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 fxcpmadd f2, B2, A1, f2 fxcsmadd f3, B2, A1, f3 .align 4.L38:#ifndef TRMMKERNEL LFPDX A1, CO1, INC2 LFPDX A2, CO2, INC2 LFPDX A3, CO3, INC2 LFPDX A4, CO4, INC2 fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, A2 fxcpmadd f2, AP, f2, A3 fxcpmadd f3, AP, f3, A4#else fpmul f0, AP, f0 fpmul f1, AP, f1 fpmul f2, AP, f2 fpmul f3, AP, f3#endif STFPDUX f0, CO1, INC2 STFPDUX f1, CO2, INC2 STFPDUX f2, CO3, INC2 STFPDUX f3, CO4, INC2#ifdef TRMMKERNEL#if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK#ifdef LEFT addi TEMP, TEMP, -2#else addi TEMP, TEMP, -4#endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP#endif#ifdef LEFT addi KK, KK, 2#endif#endif li r0, FZERO lfpsx f0, SP, r0 .align 4.L40: andi. I, M, 1 beq .L49#if defined(TRMMKERNEL)#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0#else slwi TEMP, KK, 0 + BASE_SHIFT slwi r0, KK, 2 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, BO, - 4 * SIZE fpmr f2, f0 addi BO2, BO, 2 * SIZE fpmr f3, f0#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 1#else addi TEMP, KK, 4#endif srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L44#else addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 srawi. r0, K, 3 mtspr CTR, r0 ble .L44#endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L43 .align 4.L42: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A1, B2, f1 LFPDUX B2, BO2, INC4 fxcsmadd f2, A1, B3, f2 LFPDUX B3, BO, INC4 fxcsmadd f3, A1, B4, f3 LFPDUX B4, BO2, INC4 LFPDUX A1, AO, INC4 fxcpmadd f0, A2, A5, f0 LFPDUX A5, BO, INC4 fxcpmadd f1, A2, A6, f1 LFPDUX A6, BO2, INC4 fxcsmadd f2, A2, A7, f2 LFPDUX A7, BO, INC4 fxcsmadd f3, A2, A8, f3 LFPDUX A8, BO2, INC4 LFPDUX A2, AO2, INC4 fxcpmadd f0, A3, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A3, B2, f1 LFPDUX B2, BO2, INC4 fxcsmadd f2, A3, B3, f2 LFPDUX B3, BO, INC4 fxcsmadd f3, A3, B4, f3 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 fxcpmadd f0, A4, A5, f0
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -