📄 zgemm_kernel_hummer.s
字号:
add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 2#else addi TEMP, KK, 2#endif srawi. r0, TEMP, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L1024#else addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, K, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L1024#endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX B3, BO, INC4 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 LFPDUX A5, AO, INC4 LFPDUX B5, BO, INC4 LFPDUX A6, AO2, INC4 LFPDUX B6, BO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A9, BO, INC4 LFPDUX A10, BO2, INC4 bdz- .L1023 .align 4.L1022: FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 LFPDUX A8, AO2, INC4 FXCPMADD f8, B2, A1, f8 nop FXCSMADD f12, B2, A1, f12 LFPDUX A1, AO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX B1, BO, INC4 FXCPMADD f9, B2, A2, f9 nop FXCSMADD f13, B2, A2, f13 LFPDUX B2, BO2, INC4 FXCPMADD f0, B3, A3, f0 nop FXCSMADD f4, B3, A3, f4 LFPDUX A2, AO2, INC4 FXCPMADD f8, B4, A3, f8 nop FXCSMADD f12, B4, A3, f12 LFPDUX A3, AO, INC4 FXCPMADD f1, B3, A4, f1 nop FXCSMADD f5, B3, A4, f5 LFPDUX B3, BO, INC4 FXCPMADD f9, B4, A4, f9 nop FXCSMADD f13, B4, A4, f13 LFPDUX B4, BO2, INC4 FXCPMADD f0, B5, A5, f0 nop FXCSMADD f4, B5, A5, f4 LFPDUX A4, AO2, INC4 FXCPMADD f8, B6, A5, f8 nop FXCSMADD f12, B6, A5, f12 LFPDUX A5, AO, INC4 FXCPMADD f1, B5, A6, f1 nop FXCSMADD f5, B5, A6, f5 LFPDUX B5, BO, INC4 FXCPMADD f9, B6, A6, f9 nop FXCSMADD f13, B6, A6, f13 LFPDUX B6, BO2, INC4 FXCPMADD f0, A9, A7, f0 nop FXCSMADD f4, A9, A7, f4 LFPDUX A6, AO2, INC4 FXCPMADD f8, A10, A7, f8 nop FXCSMADD f12, A10, A7, f12 LFPDUX A7, AO, INC4 FXCPMADD f1, A9, A8, f1 nop FXCSMADD f5, A9, A8, f5 LFPDUX A9, BO, INC4 FXCPMADD f9, A10, A8, f9 nop FXCSMADD f13, A10, A8, f13 LFPDUX A10, BO2, INC4 bdnz+ .L1022 .align 4.L1023: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 LFPDUX A8, AO2, INC4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 FXCPMADD f0, B3, A3, f0 FXCSMADD f4, B3, A3, f4 FXCPMADD f8, B4, A3, f8 FXCSMADD f12, B4, A3, f12 FXCPMADD f1, B3, A4, f1 FXCSMADD f5, B3, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 FXCPMADD f0, B5, A5, f0 FXCSMADD f4, B5, A5, f4 FXCPMADD f8, B6, A5, f8 FXCSMADD f12, B6, A5, f12 FXCPMADD f1, B5, A6, f1 FXCSMADD f5, B5, A6, f5 FXCPMADD f9, B6, A6, f9 FXCSMADD f13, B6, A6, f13 FXCPMADD f0, A9, A7, f0 FXCSMADD f4, A9, A7, f4 FXCPMADD f8, A10, A7, f8 FXCSMADD f12, A10, A7, f12 FXCPMADD f1, A9, A8, f1 FXCSMADD f5, A9, A8, f5 FXCPMADD f9, A10, A8, f9 FXCSMADD f13, A10, A8, f13 .align 4.L1024: li r0, ALPHA lfpdx AP, SP, r0#ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0#endif#if defined(TRMMKERNEL)#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 2#else addi TEMP, KK, 2#endif andi. r0, TEMP, 3 mtspr CTR, r0#else andi. r0, K, 3 mtspr CTR, r0#endif ble+ .L1028 LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 bdz- .L1027 .align 4.L1026: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 LFPDUX A1, AO, INC4 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 LFPDUX B1, BO, INC4 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 bdnz+ .L1026 .align 4.L1027: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 .align 4.L1028:#ifndef TRMMKERNEL LFDUX A1, CO1, INC LFDUX A2, CO1, INC2 LFDUX A3, CO2, INC LFDUX A4, CO2, INC2 LFSDUX A1, CO1, INCM1 LFSDUX A2, CO1, INC2 LFSDUX A3, CO2, INCM1 LFSDUX A4, CO2, INC2#endif#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f4 fpadd f8, f8, f12 fpadd f1, f1, f5 fpadd f9, f9, f13#else fpsub f0, f0, f4 fpsub f8, f8, f12 fpsub f1, f1, f5 fpsub f9, f9, f13#endif#ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcpmadd A2, f1, AP, A2 fxcpmadd A3, f8, AP, A3 fxcpmadd A4, f9, AP, A4 fxcxnpma f0, f0, AP, A1 fxcxnpma f1, f1, AP, A2 fxcxnpma f8, f8, AP, A3 fxcxnpma f9, f9, AP, A4 STFDUX f0, CO1, INCM3 STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f8, CO2, INCM3 STFSDUX f8, CO2, INC STFDUX f9, CO2, INC STFSDUX f9, CO2, INC#else fxcpmadd f12, f0, AP, f30 fxcpmadd f13, f1, AP, f30 fxcpmadd f14, f8, AP, f30 fxcpmadd f15, f9, AP, f30 fxcxnpma f0, f0, AP, f12 fxcxnpma f1, f1, AP, f13 fxcxnpma f8, f8, AP, f14 fxcxnpma f9, f9, AP, f15 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f8, CO2, INC STFSDUX f8, CO2, INC STFDUX f9, CO2, INC STFSDUX f9, CO2, INC#endif#ifdef TRMMKERNEL#if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK#ifdef LEFT addi TEMP, TEMP, -2#else addi TEMP, TEMP, -2#endif slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP#endif#ifdef LEFT addi KK, KK, 2#endif#endif li r0, FZERO lfpsx f0, SP, r0 .align 4.L1030: andi. I, M, 1 beq .L1049#if defined(TRMMKERNEL)#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0#else slwi TEMP, KK, 0 + ZBASE_SHIFT slwi r0, KK, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, BO, - 4 * SIZE fpmr f2, f0 addi BO2, BO, 2 * SIZE fpmr f3, f0#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 1#else addi TEMP, KK, 2#endif srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L1034#else addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 srawi. r0, K, 2 mtspr CTR, r0 ble .L1034#endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L1033 .align 4.L1032: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX B1, BO, INC4 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 LFPDUX B2, BO2, INC4 LFPDUX A1, AO, INC4 FXCPMADD f0, B3, A2, f0 FXCSMADD f1, B3, A2, f1 LFPDUX B3, BO, INC4 FXCPMADD f2, B4, A2, f2 FXCSMADD f3, B4, A2, f3 LFPDUX B4, BO2, INC4 LFPDUX A2, AO2, INC4 FXCPMADD f0, A5, A3, f0 FXCSMADD f1, A5, A3, f1 LFPDUX A5, BO, INC4 FXCPMADD f2, A6, A3, f2 FXCSMADD f3, A6, A3, f3 LFPDUX A6, BO2, INC4 LFPDUX A3, AO, INC4 FXCPMADD f0, A7, A4, f0 FXCSMADD f1, A7, A4, f1 LFPDUX A7, BO, INC4 FXCPMADD f2, A8, A4, f2 FXCSMADD f3, A8, A4, f3 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L1032 .align 4.L1033: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 FXCPMADD f0, B3, A2, f0 FXCSMADD f1, B3, A2, f1 FXCPMADD f2, B4, A2, f2 FXCSMADD f3, B4, A2, f3 FXCPMADD f0, A5, A3, f0 FXCSMADD f1, A5, A3, f1 FXCPMADD f2, A6, A3, f2 FXCSMADD f3, A6, A3, f3 FXCPMADD f0, A7, A4, f0 FXCSMADD f1, A7, A4, f1 FXCPMADD f2, A8, A4, f2 FXCSMADD f3, A8, A4, f3 .align 4.L1034: li r0, ALPHA lfpdx AP, SP, r0#ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0#endif#if defined(TRMMKERNEL)#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 1#else addi TEMP, KK, 2#endif andi. r0, TEMP, 3 mtspr CTR, r0#else andi. r0, K, 3 mtspr CTR, r0#endif ble+ .L1038 LFPDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdz- .L1037 .align 4.L1036: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX B1, BO, INC4 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 LFPDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdnz+ .L1036 .align 4.L1037: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 .align 4.L1038:#ifndef TRMMKERNEL LFDUX A1, CO1, INC LFDUX A2, CO2, INC LFSDUX A1, CO1, INC LFSDUX A2, CO2, INC#endif#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f1 fpadd f2, f2, f3#else fpsub f0, f0, f1 fpsub f2, f2, f3#endif#ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcpmadd A2, f2, AP, A2 fxcxnpma f0, f0, AP, A1 fxcxnpma f2, f2, AP, A2 STFDUX f0, CO1, INCM1 STFSDUX f0, CO1, INC STFDUX f2, CO2, INCM1 STFSDUX f2, CO2, INC#else fxcpmadd f12, f0, AP, f30 fxcpmadd f13, f2, AP, f30 fxcxnpma f0, f0, AP, f12 fxcxnpma f2, f2, AP, f13 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f2, CO2, INC STFSDUX f2, CO2, INC#endif#ifdef TRMMKERNEL#if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK#ifdef LEFT addi TEMP, TEMP, -1#else addi TEMP, TEMP, -2#endif slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP#endif#ifdef LEFT addi KK, KK, 1#endif#endif li r0, FZERO lfpsx f0, SP, r0 .align 4.L1049:#if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 2#endif addi B, BO, 4 * SIZE addic. J, J, -1 bgt+ .L1010 .align 4.L1050: andi. J, N, 1 beq .L10999 mr CO1, C#if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET#endif addi AO, A, -2 * SIZE li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 2 ble .L1060 .align 4.L1051:#if defined(TRMMKERNEL)#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0#else slwi TEMP, KK, 2 + ZBASE_SHIFT slwi r0, KK, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 fpmr f4, f0 addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 4#else addi TEMP, KK, 1#endif srawi. r0, TEMP, 2 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L1054#else srawi. r0, K, 2 fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L1054#endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L1053 .align 4.L1052: FXCPMADD f0, B1, A1, f0 LFPDUX B4, BO, INC2 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B2, A5, f0 LFPDUX B1, BO, INC2 FXCSMADD f4, B2, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B2, A6, f1 nop FXCSMADD f5, B2, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B2, A7, f2 nop FXCSMADD f6, B2, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B2, A8, f3 nop FXCSMADD f7, B2, A8, f7 LFPDUX A8, AO, INC2 FXCPMADD f0, B3, A1, f0 LFPDUX B2, BO, INC2 FXCSMADD f4, B3, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B3, A3, f2 nop FXCSMADD f6, B3, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B4, A5, f0 LFPDUX B3, BO, INC2 FXCSMADD f4, B4, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B4, A6, f1 nop FXCSMADD f5, B4, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B4, A7, f2 nop FXCSMADD f6, B4, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B4, A8, f3 nop FXCSMADD f7, B4, A8, f7 LFPDUX A8, AO, INC2 bdnz+ .L1052 .align 4.L1053: FXCPMADD f0, B1, A1, f0 LFPDUX B4, BO, INC2 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -