📄 zgemm_kernel_hummer.s
字号:
LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L53 .align 4.L52: FXCPMADD f0, B1, A1, f0 LFPDUX B4, BO, INC2 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B2, A5, f0 LFPDUX B1, BO, INC2 FXCSMADD f4, B2, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B2, A6, f1 nop FXCSMADD f5, B2, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B2, A7, f2 nop FXCSMADD f6, B2, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B2, A8, f3 nop FXCSMADD f7, B2, A8, f7 LFPDUX A8, AO, INC2 FXCPMADD f0, B3, A1, f0 LFPDUX B2, BO, INC2 FXCSMADD f4, B3, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B3, A3, f2 nop FXCSMADD f6, B3, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B4, A5, f0 LFPDUX B3, BO, INC2 FXCSMADD f4, B4, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B4, A6, f1 nop FXCSMADD f5, B4, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B4, A7, f2 nop FXCSMADD f6, B4, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B4, A8, f3 nop FXCSMADD f7, B4, A8, f7 LFPDUX A8, AO, INC2 bdnz+ .L52 .align 4.L53: FXCPMADD f0, B1, A1, f0 LFPDUX B4, BO, INC2 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B2, A5, f0 nop FXCSMADD f4, B2, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B2, A6, f1 nop FXCSMADD f5, B2, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B2, A7, f2 nop FXCSMADD f6, B2, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B2, A8, f3 nop FXCSMADD f7, B2, A8, f7 LFPDUX A8, AO, INC2 FXCPMADD f0, B3, A1, f0 FXCSMADD f4, B3, A1, f4 FXCPMADD f1, B3, A2, f1 FXCSMADD f5, B3, A2, f5 FXCPMADD f2, B3, A3, f2 FXCSMADD f6, B3, A3, f6 FXCPMADD f3, B3, A4, f3 FXCSMADD f7, B3, A4, f7 FXCPMADD f0, B4, A5, f0 FXCSMADD f4, B4, A5, f4 FXCPMADD f1, B4, A6, f1 FXCSMADD f5, B4, A6, f5 FXCPMADD f2, B4, A7, f2 FXCSMADD f6, B4, A7, f6 FXCPMADD f3, B4, A8, f3 FXCSMADD f7, B4, A8, f7 .align 4.L54: li r0, ALPHA lfpdx AP, SP, r0#ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0#endif#if defined(TRMMKERNEL)#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 4#else addi TEMP, KK, 1#endif andi. r0, TEMP, 3 mtspr CTR, r0#else andi. r0, K, 3 mtspr CTR, r0#endif ble+ .L58 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 bdz- .L57 .align 4.L56: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L56 .align 4.L57: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f2, B1, A3, f2 FXCSMADD f6, B1, A3, f6 FXCPMADD f3, B1, A4, f3 FXCSMADD f7, B1, A4, f7 .align 4.L58:#ifndef TRMMKERNEL LFPDUX A1, CO1, INC2 LFPDUX A2, CO1, INC2 LFPDUX A3, CO1, INC2 LFPDUX A4, CO1, INC2#endif#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f4 fpadd f1, f1, f5 fpadd f2, f2, f6 fpadd f3, f3, f7#else fpsub f0, f0, f4 fpsub f1, f1, f5 fpsub f2, f2, f6 fpsub f3, f3, f7#endif#ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcpmadd A2, f1, AP, A2 fxcpmadd A3, f2, AP, A3 fxcpmadd A4, f3, AP, A4 fxcxnpma f0, f0, AP, A1 fxcxnpma f1, f1, AP, A2 fxcxnpma f2, f2, AP, A3 fxcxnpma f3, f3, AP, A4 STFPDUX f0, CO1, INCM7 STFPDUX f1, CO1, INC2 STFPDUX f2, CO1, INC2 STFPDUX f3, CO1, INC2#else fxcpmadd f12, f0, AP, f30 fxcpmadd f13, f1, AP, f30 fxcpmadd f14, f2, AP, f30 fxcpmadd f15, f3, AP, f30 fxcxnpma f0, f0, AP, f12 fxcxnpma f1, f1, AP, f13 fxcxnpma f2, f2, AP, f14 fxcxnpma f3, f3, AP, f15 STFPDUX f0, CO1, INC2 STFPDUX f1, CO1, INC2 STFPDUX f2, CO1, INC2 STFPDUX f3, CO1, INC2#endif#ifdef TRMMKERNEL#if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK#ifdef LEFT addi TEMP, TEMP, -4#else addi TEMP, TEMP, -1#endif slwi r0, TEMP, 2 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP#endif#ifdef LEFT addi KK, KK, 4#endif#endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L51 .align 4.L60: andi. I, M, 2 beq .L70#if defined(TRMMKERNEL)#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0#else slwi TEMP, KK, 1 + ZBASE_SHIFT slwi r0, KK, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE fpmr f1, f0#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 2#else addi TEMP, KK, 1#endif srawi. r0, TEMP, 2 fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L64#else srawi. r0, K, 2 fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L64#endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L63 .align 4.L62: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 FXCPMADD f0, B2, A3, f0 FXCSMADD f2, B2, A3, f2 LFPDUX A3, AO, INC2 FXCPMADD f1, B2, A4, f1 FXCSMADD f3, B2, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 FXCPMADD f0, B3, A5, f0 FXCSMADD f2, B3, A5, f2 LFPDUX A5, AO, INC2 FXCPMADD f1, B3, A6, f1 FXCSMADD f3, B3, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 FXCPMADD f0, B4, A7, f0 FXCSMADD f2, B4, A7, f2 LFPDUX A7, AO, INC2 FXCPMADD f1, B4, A8, f1 FXCSMADD f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L62 .align 4.L63: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 FXCPMADD f0, B2, A3, f0 FXCSMADD f2, B2, A3, f2 FXCPMADD f1, B2, A4, f1 FXCSMADD f3, B2, A4, f3 FXCPMADD f0, B3, A5, f0 FXCSMADD f2, B3, A5, f2 FXCPMADD f1, B3, A6, f1 FXCSMADD f3, B3, A6, f3 FXCPMADD f0, B4, A7, f0 FXCSMADD f2, B4, A7, f2 FXCPMADD f1, B4, A8, f1 FXCSMADD f3, B4, A8, f3 .align 4.L64: li r0, ALPHA lfpdx AP, SP, r0#ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0#endif#if defined(TRMMKERNEL)#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 2#else addi TEMP, KK, 1#endif andi. r0, TEMP, 3 mtspr CTR, r0#else andi. r0, K, 3 mtspr CTR, r0#endif ble+ .L68 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdz- .L67 .align 4.L66: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdnz+ .L66 .align 4.L67: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 .align 4.L68:#ifndef TRMMKERNEL LFPDUX A1, CO1, INC2 LFPDUX A2, CO1, INC2#endif#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f2 fpadd f1, f1, f3#else fpsub f0, f0, f2 fpsub f1, f1, f3#endif#ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcpmadd A2, f1, AP, A2 fxcxnpma f0, f0, AP, A1 fxcxnpma f1, f1, AP, A2 STFPDUX f0, CO1, INCM3 STFPDUX f1, CO1, INC2#else fxcpmadd f12, f0, AP, f30 fxcpmadd f13, f1, AP, f30 fxcxnpma f0, f0, AP, f12 fxcxnpma f1, f1, AP, f13 STFPDUX f0, CO1, INC2 STFPDUX f1, CO1, INC2#endif#ifdef TRMMKERNEL#if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK#ifdef LEFT addi TEMP, TEMP, -2#else addi TEMP, TEMP, -1#endif slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP#endif#ifdef LEFT addi KK, KK, 2#endif#endif li r0, FZERO lfpsx f0, SP, r0 .align 4.L70: andi. I, M, 1 beq .L89#if defined(TRMMKERNEL)#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0#else slwi TEMP, KK, 0 + ZBASE_SHIFT slwi r0, KK, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE fpmr f1, f0#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 1#else addi TEMP, KK, 1#endif srawi. r0, TEMP, 3 fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L74#else addi BO, B, - 2 * SIZE fpmr f1, f0 srawi. r0, K, 3 fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L74#endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdz- .L73 .align 4.L72: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 FXCPMADD f2, B2, A2, f2 FXCSMADD f3, B2, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 FXCPMADD f0, B3, A3, f0 FXCSMADD f1, B3, A3, f1 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 FXCPMADD f2, B4, A4, f2 FXCSMADD f3, B4, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 FXCPMADD f0, B5, A5, f0 FXCSMADD f1, B5, A5, f1 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 FXCPMADD f2, B6, A6, f2 FXCSMADD f3, B6, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 FXCPMADD f0, A9, A7, f0 FXCSMADD f1, A9, A7, f1 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 FXCPMADD f2, A10, A8, f2 FXCSMADD f3, A10, A8, f3 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdnz+ .L72 .align 4.L73: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A2, f2 FXCSMADD f3, B2, A2, f3 FXCPMADD f0, B3, A3, f0 FXCSMADD f1, B3, A3, f1 FXCPMADD f2, B4, A4, f2 FXCSMADD f3, B4, A4, f3 FXCPMADD f0, B5, A5, f0 FXCSMADD f1, B5, A5, f1 FXCPMADD f2, B6, A6, f2 FXCSMADD f3, B6, A6, f3 FXCPMADD f0, A9, A7, f0 FXCSMADD f1, A9, A7, f1 FXCPMADD f2, A10, A8, f2 FXCSMADD f3, A10, A8, f3 .align 4.L74: li r0, ALPHA lfpdx AP, SP, r0#ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0#endif#if defined(TRMMKERNEL)#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 1#else addi TEMP, KK, 1#endif andi. r0, TEMP, 7 mtspr CTR, r0#else andi. r0, K, 7 mtspr CTR, r0#endif ble+ .L78 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdz- .L77 .align 4.L76: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L76 .align 4.L77: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 .align 4.L78:#ifndef TRMMKERNEL LFPDX A1, CO1, INC2#endif fpadd f0, f0, f2 fpadd f1, f1, f3#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f1#else fpsub f0, f0, f1#endif#ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcxnpma f0, f0, AP, A1#else fxcpmadd f12, f0, AP, f30 fxcxnpma f0, f0, AP, f12#endif STFPDUX f0, CO1, INC2 li r0, FZERO lfpsx f0, SP, r0 .align 4.L89: addi B, BO, 2 * SIZE .align 4.L999: addi SP, SP, 20 lwzu r14, 4(SP) lwzu r15, 4(SP) lwzu r16, 4(SP) lwzu r17, 4(SP) lwzu r18, 4(SP) lwzu r19, 4(SP) lwzu r20, 4(SP) lwzu r21, 4(SP) lwzu r22, 4(SP) lwzu r23, 4(SP) lwzu r24, 4(SP) lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f31, SP, r0 lfpdux f30, SP, r0 lfpdux f29, SP, r0 lfpdux f28, SP, r0 lfpdux f27, SP, r0 lfpdux f26, SP, r0 lfpdux f25, SP, r0 lfpdux f24, SP, r0 lfpdux f23, SP, r0 lfpdux f22, SP, r0 lfpdux f21, SP, r0 lfpdux f20, SP, r0 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -