📄 zgemm_kernel_hummer.s
字号:
fpsub f2, f2, f6 fpsub f10, f10, f14 fpsub f3, f3, f7 fpsub f11, f11, f15#endif#ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcpmadd B1, f1, AP, B1 fxcpmadd A3, f2, AP, A3 fxcpmadd A5, f3, AP, A5 fxcxnpma f0, f0, AP, A1 fxcpmadd B3, f8, AP, B3 fxcxnpma f1, f1, AP, B1 fxcpmadd A6, f9, AP, A6 fxcxnpma f2, f2, AP, A3 fxcpmadd A7, f10, AP, A7 fxcxnpma f3, f3, AP, A5 fxcpmadd B2, f11, AP, B2 fxcxnpma f8, f8, AP, B3 STFPDUX f0, CO1, INCM7 fxcxnpma f9, f9, AP, A6 STFPDUX f1, CO1, INC2 fxcxnpma f10, f10, AP, A7 STFPDUX f2, CO1, INC2 fxcxnpma f11, f11, AP, B2 STFPDUX f3, CO1, INC2 STFPDUX f8, CO2, INCM7 STFPDUX f9, CO2, INC2 STFPDUX f10, CO2, INC2 STFPDUX f11, CO2, INC2#else fxcpmadd f12, f0, AP, f30 fxcpmadd f13, f1, AP, f30 fxcpmadd f14, f2, AP, f30 fxcpmadd f15, f3, AP, f30 fxcxnpma f0, f0, AP, f12 fxcxnpma f1, f1, AP, f13 fxcxnpma f2, f2, AP, f14 fxcxnpma f3, f3, AP, f15 fxcpmadd f16, f8, AP, f30 fxcpmadd f17, f9, AP, f30 fxcpmadd f18, f10, AP, f30 fxcpmadd f19, f11, AP, f30 fxcxnpma f8, f8, AP, f16 fxcxnpma f9, f9, AP, f17 fxcxnpma f10, f10, AP, f18 fxcxnpma f11, f11, AP, f19 STFPDUX f0, CO1, INC2 STFPDUX f1, CO1, INC2 STFPDUX f2, CO1, INC2 STFPDUX f3, CO1, INC2 STFPDUX f8, CO2, INC2 STFPDUX f9, CO2, INC2 STFPDUX f10, CO2, INC2 STFPDUX f11, CO2, INC2#endif#ifdef TRMMKERNEL#if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK#ifdef LEFT addi TEMP, TEMP, -4#else addi TEMP, TEMP, -2#endif slwi r0, TEMP, 2 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP#endif#ifdef LEFT addi KK, KK, 4#endif#endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L11 .align 4.L20: andi. I, M, 2 beq .L30#if defined(TRMMKERNEL)#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0#else slwi TEMP, KK, 1 + ZBASE_SHIFT slwi r0, KK, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 2#else addi TEMP, KK, 2#endif srawi. r0, TEMP, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L24#else addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, K, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L24#endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX B3, BO, INC4 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 LFPDUX A5, AO, INC4 LFPDUX B5, BO, INC4 LFPDUX A6, AO2, INC4 LFPDUX B6, BO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A9, BO, INC4 LFPDUX A10, BO2, INC4 bdz- .L23 .align 4.L22: FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 LFPDUX A8, AO2, INC4 FXCPMADD f8, B2, A1, f8 nop FXCSMADD f12, B2, A1, f12 LFPDUX A1, AO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX B1, BO, INC4 FXCPMADD f9, B2, A2, f9 nop FXCSMADD f13, B2, A2, f13 LFPDUX B2, BO2, INC4 FXCPMADD f0, B3, A3, f0 nop FXCSMADD f4, B3, A3, f4 LFPDUX A2, AO2, INC4 FXCPMADD f8, B4, A3, f8 nop FXCSMADD f12, B4, A3, f12 LFPDUX A3, AO, INC4 FXCPMADD f1, B3, A4, f1 nop FXCSMADD f5, B3, A4, f5 LFPDUX B3, BO, INC4 FXCPMADD f9, B4, A4, f9 nop FXCSMADD f13, B4, A4, f13 LFPDUX B4, BO2, INC4 FXCPMADD f0, B5, A5, f0 nop FXCSMADD f4, B5, A5, f4 LFPDUX A4, AO2, INC4 FXCPMADD f8, B6, A5, f8 nop FXCSMADD f12, B6, A5, f12 LFPDUX A5, AO, INC4 FXCPMADD f1, B5, A6, f1 nop FXCSMADD f5, B5, A6, f5 LFPDUX B5, BO, INC4 FXCPMADD f9, B6, A6, f9 nop FXCSMADD f13, B6, A6, f13 LFPDUX B6, BO2, INC4 FXCPMADD f0, A9, A7, f0 nop FXCSMADD f4, A9, A7, f4 LFPDUX A6, AO2, INC4 FXCPMADD f8, A10, A7, f8 nop FXCSMADD f12, A10, A7, f12 LFPDUX A7, AO, INC4 FXCPMADD f1, A9, A8, f1 nop FXCSMADD f5, A9, A8, f5 LFPDUX A9, BO, INC4 FXCPMADD f9, A10, A8, f9 nop FXCSMADD f13, A10, A8, f13 LFPDUX A10, BO2, INC4 bdnz+ .L22 .align 4.L23: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 LFPDUX A8, AO2, INC4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 FXCPMADD f0, B3, A3, f0 FXCSMADD f4, B3, A3, f4 FXCPMADD f8, B4, A3, f8 FXCSMADD f12, B4, A3, f12 FXCPMADD f1, B3, A4, f1 FXCSMADD f5, B3, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 FXCPMADD f0, B5, A5, f0 FXCSMADD f4, B5, A5, f4 FXCPMADD f8, B6, A5, f8 FXCSMADD f12, B6, A5, f12 FXCPMADD f1, B5, A6, f1 FXCSMADD f5, B5, A6, f5 FXCPMADD f9, B6, A6, f9 FXCSMADD f13, B6, A6, f13 FXCPMADD f0, A9, A7, f0 FXCSMADD f4, A9, A7, f4 FXCPMADD f8, A10, A7, f8 FXCSMADD f12, A10, A7, f12 FXCPMADD f1, A9, A8, f1 FXCSMADD f5, A9, A8, f5 FXCPMADD f9, A10, A8, f9 FXCSMADD f13, A10, A8, f13 .align 4.L24: li r0, ALPHA lfpdx AP, SP, r0#ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0#endif#if defined(TRMMKERNEL)#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 2#else addi TEMP, KK, 2#endif andi. r0, TEMP, 3 mtspr CTR, r0#else andi. r0, K, 3 mtspr CTR, r0#endif ble+ .L28 LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 bdz- .L27 .align 4.L26: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 LFPDUX A1, AO, INC4 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 LFPDUX B1, BO, INC4 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 bdnz+ .L26 .align 4.L27: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 .align 4.L28:#ifndef TRMMKERNEL LFPDUX A1, CO1, INC2 LFPDUX A2, CO1, INC2 LFPDUX A3, CO2, INC2 LFPDUX A4, CO2, INC2#endif#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f4 fpadd f8, f8, f12 fpadd f1, f1, f5 fpadd f9, f9, f13#else fpsub f0, f0, f4 fpsub f8, f8, f12 fpsub f1, f1, f5 fpsub f9, f9, f13#endif#ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcpmadd A2, f1, AP, A2 fxcpmadd A3, f8, AP, A3 fxcpmadd A4, f9, AP, A4 fxcxnpma f0, f0, AP, A1 fxcxnpma f1, f1, AP, A2 fxcxnpma f8, f8, AP, A3 fxcxnpma f9, f9, AP, A4 STFPDUX f0, CO1, INCM3 STFPDUX f1, CO1, INC2 STFPDUX f8, CO2, INCM3 STFPDUX f9, CO2, INC2#else fxcpmadd f12, f0, AP, f30 fxcpmadd f13, f1, AP, f30 fxcpmadd f14, f8, AP, f30 fxcpmadd f15, f9, AP, f30 fxcxnpma f0, f0, AP, f12 fxcxnpma f1, f1, AP, f13 fxcxnpma f8, f8, AP, f14 fxcxnpma f9, f9, AP, f15 STFPDUX f0, CO1, INC2 STFPDUX f1, CO1, INC2 STFPDUX f8, CO2, INC2 STFPDUX f9, CO2, INC2#endif#ifdef TRMMKERNEL#if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK#ifdef LEFT addi TEMP, TEMP, -2#else addi TEMP, TEMP, -2#endif slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP#endif#ifdef LEFT addi KK, KK, 2#endif#endif li r0, FZERO lfpsx f0, SP, r0 .align 4.L30: andi. I, M, 1 beq .L49#if defined(TRMMKERNEL)#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0#else slwi TEMP, KK, 0 + ZBASE_SHIFT slwi r0, KK, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, BO, - 4 * SIZE fpmr f2, f0 addi BO2, BO, 2 * SIZE fpmr f3, f0#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 1#else addi TEMP, KK, 2#endif srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L34#else addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 srawi. r0, K, 2 mtspr CTR, r0 ble .L34#endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L33 .align 4.L32: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX B1, BO, INC4 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 LFPDUX B2, BO2, INC4 LFPDUX A1, AO, INC4 FXCPMADD f0, B3, A2, f0 FXCSMADD f1, B3, A2, f1 LFPDUX B3, BO, INC4 FXCPMADD f2, B4, A2, f2 FXCSMADD f3, B4, A2, f3 LFPDUX B4, BO2, INC4 LFPDUX A2, AO2, INC4 FXCPMADD f0, A5, A3, f0 FXCSMADD f1, A5, A3, f1 LFPDUX A5, BO, INC4 FXCPMADD f2, A6, A3, f2 FXCSMADD f3, A6, A3, f3 LFPDUX A6, BO2, INC4 LFPDUX A3, AO, INC4 FXCPMADD f0, A7, A4, f0 FXCSMADD f1, A7, A4, f1 LFPDUX A7, BO, INC4 FXCPMADD f2, A8, A4, f2 FXCSMADD f3, A8, A4, f3 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L32 .align 4.L33: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 FXCPMADD f0, B3, A2, f0 FXCSMADD f1, B3, A2, f1 FXCPMADD f2, B4, A2, f2 FXCSMADD f3, B4, A2, f3 FXCPMADD f0, A5, A3, f0 FXCSMADD f1, A5, A3, f1 FXCPMADD f2, A6, A3, f2 FXCSMADD f3, A6, A3, f3 FXCPMADD f0, A7, A4, f0 FXCSMADD f1, A7, A4, f1 FXCPMADD f2, A8, A4, f2 FXCSMADD f3, A8, A4, f3 .align 4.L34: li r0, ALPHA lfpdx AP, SP, r0#ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0#endif#if defined(TRMMKERNEL)#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 1#else addi TEMP, KK, 2#endif andi. r0, TEMP, 3 mtspr CTR, r0#else andi. r0, K, 3 mtspr CTR, r0#endif ble+ .L38 LFPDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdz- .L37 .align 4.L36: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX B1, BO, INC4 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 LFPDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdnz+ .L36 .align 4.L37: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 .align 4.L38:#ifndef TRMMKERNEL LFPDX A1, CO1, INC2 LFPDX A2, CO2, INC2#endif#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f1 fpadd f2, f2, f3#else fpsub f0, f0, f1 fpsub f2, f2, f3#endif#ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcpmadd A2, f2, AP, A2 fxcxnpma f0, f0, AP, A1 fxcxnpma f2, f2, AP, A2#else fxcpmadd f12, f0, AP, f30 fxcpmadd f13, f2, AP, f30 fxcxnpma f0, f0, AP, f12 fxcxnpma f2, f2, AP, f13#endif STFPDUX f0, CO1, INC2 STFPDUX f2, CO2, INC2#ifdef TRMMKERNEL#if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK#ifdef LEFT addi TEMP, TEMP, -1#else addi TEMP, TEMP, -2#endif slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP#endif#ifdef LEFT addi KK, KK, 1#endif#endif li r0, FZERO lfpsx f0, SP, r0 .align 4.L49:#if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 2#endif addi B, BO, 4 * SIZE addic. J, J, -1 bgt+ .L10 .align 4.L50: andi. J, N, 1 beq .L999 mr CO1, C#if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET#endif addi AO, A, -2 * SIZE li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 2 ble .L60 .align 4.L51:#if defined(TRMMKERNEL)#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0#else slwi TEMP, KK, 2 + ZBASE_SHIFT slwi r0, KK, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 fpmr f4, f0 addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 4#else addi TEMP, KK, 1#endif srawi. r0, TEMP, 2 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L54#else srawi. r0, K, 2 fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L54#endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -