📄 gemm_kernel_hummer.s
字号:
add BO, BO, INC bdz- .L107 .align 4.L106: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFDX B1, BO, INC2 LFPDUX A2, AO, INC2 add BO, BO, INC bdnz+ .L106 .align 4.L107: fxcpmadd f0, B1, A1, f0 fxcpmadd f1, B1, A2, f1 .align 4.L108:#ifndef TRMMKERNEL LFPDUX A1, CO1, INC2 LFPDUX B1, CO1, INC2 fpadd f0, f0, f2 fpadd f1, f1, f3 fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, B1 STFPDUX f0, CO1, INCM3 STFPDUX f1, CO1, INC2#else fpadd f0, f0, f2 fpadd f1, f1, f3 fpmul f0, AP, f0 fpmul f1, AP, f1 STFPDUX f0, CO1, INC2 STFPDUX f1, CO1, INC2#endif#ifdef TRMMKERNEL#if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK#ifdef LEFT addi TEMP, TEMP, -4#else addi TEMP, TEMP, -1#endif slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP#endif#ifdef LEFT addi KK, KK, 4#endif#endif li r0, FZERO lfpsx f0, SP, r0 .align 4.L110: andi. I, M, 2 beq .L120#if defined(TRMMKERNEL)#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0#else slwi TEMP, KK, 1 + BASE_SHIFT slwi r0, KK, 0 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 fpmr f1, f0 addi BO, BO, - 2 * SIZE fpmr f2, f0 fpmr f3, f0#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 2#else addi TEMP, KK, 1#endif srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L114#else addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, K, 3 mtspr CTR, r0 ble .L114#endif LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdz- .L113 .align 4.L112: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcsmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f2, B2, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B2, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B3, A6, f1 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f2, B4, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L112 .align 4.L113: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A2, f1 fxcpmadd f2, B2, A3, f2 fxcsmadd f3, B2, A4, f3 fxcpmadd f0, B3, A5, f0 fxcsmadd f1, B3, A6, f1 fxcpmadd f2, B4, A7, f2 fxcsmadd f3, B4, A8, f3 .align 4.L114: lfd AP, ALPHA(SP)#ifdef TRMMKERNEL fsmfp AP, AP#endif#if defined(TRMMKERNEL)#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 2#else addi TEMP, KK, 1#endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP#else andi. r0, K, 7 mtspr CTR, r0#endif ble+ .L118 LFPDUX A1, AO, INC2 LFDX B1, BO, INC2 add BO, BO, INC bdz- .L117 .align 4.L116: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 LFDX B1, BO, INC2 add BO, BO, INC bdnz+ .L116 .align 4.L117: fxcpmadd f0, B1, A1, f0 .align 4.L118:#ifndef TRMMKERNEL LFPDX A1, CO1, INC2 fpadd f0, f0, f1 fpadd f2, f3, f2 fpadd f0, f0, f2 fxcpmadd f1, AP, f0, A1 li r0, FZERO lfpsx f0, SP, r0 STFPDUX f1, CO1, INC2#else fpadd f0, f0, f1 fpadd f2, f3, f2 fpadd f0, f0, f2 fpmul f1, AP, f0 li r0, FZERO lfpsx f0, SP, r0 STFPDUX f1, CO1, INC2#endif#ifdef TRMMKERNEL#if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK#ifdef LEFT addi TEMP, TEMP, -2#else addi TEMP, TEMP, -1#endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP#endif#ifdef LEFT addi KK, KK, 2#endif#endif .align 4.L120: andi. I, M, 1 beq .L999#if defined(TRMMKERNEL)#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0#else slwi TEMP, KK, 0 + BASE_SHIFT slwi r0, KK, 0 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 fpmr f1, f0 addi BO, BO, - 2 * SIZE fpmr f2, f0 fpmr f3, f0#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 1#else addi TEMP, KK, 1#endif srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L124#else addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, K, 3 mtspr CTR, r0 ble .L124#endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 bdz- .L123 .align 4.L122: fpmadd f0, A1, B1, f0 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 fpmadd f1, A2, B2, f1 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 fpmadd f2, A3, B3, f2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 fpmadd f3, A4, B4, f3 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L122 .align 4.L123: fpmadd f0, A1, B1, f0 fpmadd f1, A2, B2, f1 fpmadd f2, A3, B3, f2 fpmadd f3, A4, B4, f3 .align 4.L124: lfd AP, ALPHA(SP)#ifdef TRMMKERNEL fsmfp AP, AP#endif#if defined(TRMMKERNEL)#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 1#else addi TEMP, KK, 1#endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP#else andi. r0, K, 7 mtspr CTR, r0#endif ble+ .L128 LFDX A1, AO, INC2 LFDX B1, BO, INC2 add AO, AO, INC add BO, BO, INC bdz- .L127 .align 4.L126: fmadd f0, A1, B1, f0 LFDX A1, AO, INC2 LFDX B1, BO, INC2 add AO, AO, INC add BO, BO, INC bdnz+ .L126 .align 4.L127: fmadd f0, A1, B1, f0 .align 4.L128:#ifndef TRMMKERNEL LFDX A1, CO1, INC2 fpadd f0, f0, f1 fpadd f2, f2, f3 fpadd f0, f0, f2 fsmtp f1, f0 fadd f0, f0, f1 fmadd f0, AP, f0, A1#else fpadd f0, f0, f1 fpadd f2, f2, f3 fpadd f0, f0, f2 fsmtp f1, f0 fadd f0, f0, f1 fpmul f0, AP, f0#endif STFDUX f0, CO1, INC2 .align 4.L999: addi SP, SP, 12 lwzu r14, 4(SP) lwzu r15, 4(SP) lwzu r16, 4(SP) lwzu r17, 4(SP) lwzu r18, 4(SP) lwzu r19, 4(SP) lwzu r20, 4(SP) lwzu r21, 4(SP) lwzu r22, 4(SP) lwzu r23, 4(SP) lwzu r24, 4(SP) lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f31, SP, r0 lfpdux f30, SP, r0 lfpdux f29, SP, r0 lfpdux f28, SP, r0 lfpdux f27, SP, r0 lfpdux f26, SP, r0 lfpdux f25, SP, r0 lfpdux f24, SP, r0 lfpdux f23, SP, r0 lfpdux f22, SP, r0 lfpdux f21, SP, r0 lfpdux f20, SP, r0 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr .align 4.L1000: li INCM1, -1 * SIZE li INCM3, -3 * SIZE li INCM5, -5 * SIZE li INCM7, -7 * SIZE addi C, C, - 1 * SIZE srawi. J, N, 2 ble .L1050 .align 4.L1010: mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC add C, CO4, LDC#if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET#endif addi AO, A, -4 * SIZE li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 3 ble .L1020 .align 4.L1011:#if defined(TRMMKERNEL)#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0#else slwi TEMP, KK, 3 + BASE_SHIFT slwi r0, KK, 2 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK#elif defined(LEFT) addi TEMP, KK, 8#else addi TEMP, KK, 4#endif srawi. TEMP, TEMP, 2 fpmr f1, f0 mtspr CTR, TEMP ble .L1014#else addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, K, 2 fpmr f1, f0 mtspr CTR, r0 ble .L1014#endif LFPDUX A1, AO, INC4 fpmr f5, f0 LFPDUX A3, AO, INC4 fpmr f9, f0 LFPDUX B1, BO, INC4 fpmr f13, f0 LFPDUX A5, AO, INC4 fpmr f2, f0 LFPDUX A6, AO, INC4 fpmr f6, f0 LFPDUX B3, BO, INC4 fpmr f10, f0 LFPDUX A7, AO, INC4 fpmr f14, f0 LFPDUX A8, AO, INC4 fpmr f3, f0 LFPDUX B5, BO, INC4 fpmr f7, f0 LFPDUX A9, AO, INC4 fpmr f11, f0 LFPDUX A2, AO2, INC4 fpmr f15, f0 LFPDUX B2, BO2, INC4 bdz- .L1013 .align 4.L1012:## 1 ## fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 nop fxcpmadd f8, B2, A1, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A1, f12 LFPDUX B6, BO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A10, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 nop fxcpmadd f10, B2, A3, f10 nop fxcsmadd f14, B2, A3, f14 nop fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 LFPDUX A1, AO, INC4 fxcsmadd f15, B2, A4, f15 nop## 2 ## fxcpmadd f0, B3, A5, f0 nop fxcsmadd f4, B3, A5, f4 nop fxcpmadd f8, B4, A5, f8 LFPDUX B2, BO2, INC4 fxcsmadd f12, B4, A5, f12 LFPDUX B1, BO, INC4 fxcpmadd f1, B3, A2, f1 nop fxcsmadd f5, B3, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 LFPDUX A3, AO, INC4 fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B3, A6, f2 nop fxcsmadd f6, B3, A6, f6 nop fxcpmadd f10, B4, A6, f10 nop fxcsmadd f14, B4, A6, f14 nop fxcpmadd f3, B3, A4, f3 nop fxcsmadd f7, B3, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B4, A4, f11 LFPDUX A5, AO, INC4 fxcsmadd f15, B4, A4, f15 nop## 3 ## fxcpmadd f0, B5, A7, f0 nop fxcsmadd f4, B5, A7, f4 nop fxcpmadd f8, B2, A7, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A7, f12 LFPDUX B3, BO, INC4 fxcpmadd f1, B5, A2, f1 nop fxcsmadd f5, B5, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A6, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B5, A8, f2 nop fxcsmadd f6, B5, A8, f6 nop fxcpmadd f10, B2, A8, f10 nop fxcsmadd f14, B2, A8, f14 nop fxcpmadd f3, B5, A4, f3 nop fxcsmadd f7, B5, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 LFPDUX A7, AO, INC4 fxcsmadd f15, B2, A4, f15 nop## 4 ## fxcpmadd f0, B6, A9, f0 nop fxcsmadd f4, B6, A9, f4 nop fxcpmadd f8, B4, A9, f8 LFPDUX B2, BO2, INC4 fxcsmadd f12, B4, A9, f12 LFPDUX B5, BO, INC4 fxcpmadd f1, B6, A2, f1 nop fxcsmadd f5, B6, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 LFPDUX A8, AO, INC4 fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B6, A10, f2 nop fxcsmadd f6, B6, A10, f6 nop fxcpmadd f10, B4, A10, f10 nop fxcsmadd f14, B4, A10, f14 nop fxcpmadd f3, B6, A4, f3 LFPDUX A2, AO2, INC4 fxcsmadd f7, B6, A4, f7 LFPDUX A9, AO, INC4 fxcpmadd f11, B4, A4, f11 nop fxcsmadd f15, B4, A4, f15 bdnz+ .L1012 .align 4.L1013:## 1 ## fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 nop fxcpmadd f8, B2, A1, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A1, f12 LFPDUX B6, BO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A10, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 nop fxcpmadd f10, B2, A3, f10 nop fxcsmadd f14, B2, A3, f14 nop fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11#ifndef TRMMKERNEL LFDUX A1, CO1, INC#else nop#endif fxcsmadd f15, B2, A4, f15 nop## 2 ## fxcpmadd f0, B3, A5, f0 nop fxcsmadd f4, B3, A5, f4 nop fxcpmadd f8, B4, A5, f8 LFPDUX B2, BO2, INC4 fxcsma
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -