⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemm_kernel_hummer.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 5 页
字号:
	add	BO, BO, INC	bdz-	.L107	.align 4.L106:	fxcpmadd	f0,  B1, A1, f0	LFPDUX	A1,  AO,  INC2	fxcpmadd	f1,  B1, A2, f1	LFDX	B1,  BO,  INC2	LFPDUX	A2,  AO,  INC2	add	BO, BO, INC	bdnz+	.L106	.align 4.L107:	fxcpmadd	f0,  B1, A1, f0	fxcpmadd	f1,  B1, A2, f1	.align 4.L108:#ifndef TRMMKERNEL	LFPDUX	A1, CO1, INC2	LFPDUX	B1, CO1, INC2	fpadd	f0, f0, f2	fpadd	f1, f1, f3	fxcpmadd	f0,  AP, f0,  A1	fxcpmadd	f1,  AP, f1,  B1	STFPDUX	f0,  CO1, INCM3	STFPDUX	f1,  CO1, INC2#else	fpadd	f0, f0, f2	fpadd	f1, f1, f3	fpmul	f0,  AP, f0	fpmul	f1,  AP, f1	STFPDUX	f0,  CO1, INC2	STFPDUX	f1,  CO1, INC2#endif#ifdef TRMMKERNEL#if ( defined(LEFT) &&  defined(TRANSA)) || \    (!defined(LEFT) && !defined(TRANSA))	sub	TEMP, K, KK#ifdef LEFT	addi	TEMP, TEMP, -4#else	addi	TEMP, TEMP, -1#endif	slwi	r0,   TEMP, 2 + BASE_SHIFT	slwi	TEMP, TEMP, 0 + BASE_SHIFT	add	AO, AO, r0	add	BO, BO, TEMP#endif#ifdef LEFT	addi	KK, KK, 4#endif#endif	li	r0, FZERO	lfpsx	f0, SP, r0	.align 4.L110:	andi.	I, M,  2	beq	.L120#if defined(TRMMKERNEL)#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))	addi	BO,  B,  - 2 * SIZE	fpmr	f1,  f0	fpmr	f2,  f0	fpmr	f3,  f0#else	slwi	TEMP, KK, 1 + BASE_SHIFT	slwi	r0,   KK, 0 + BASE_SHIFT	add	AO, AO, TEMP	add	BO, B,  r0	fpmr	f1,  f0	addi	BO,  BO,  - 2 * SIZE	fpmr	f2,  f0	fpmr	f3,  f0#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	TEMP, K, KK#elif defined(LEFT)	addi	TEMP, KK, 2#else	addi	TEMP, KK, 1#endif	srawi.	r0,  TEMP,  3	mtspr	CTR, r0	ble	.L114#else	addi	BO,  B,  - 2 * SIZE	fpmr	f1,  f0	fpmr	f2,  f0	fpmr	f3,  f0	srawi.	r0,  K,  3	mtspr	CTR, r0	ble	.L114#endif	LFPDUX	A1,  AO,  INC2	LFPDUX	A2,  AO,  INC2	LFPDUX	B1,  BO,  INC2	LFPDUX	A3,  AO,  INC2	LFPDUX	A4,  AO,  INC2	LFPDUX	B2,  BO,  INC2	LFPDUX	A5,  AO,  INC2	LFPDUX	A6,  AO,  INC2	LFPDUX	B3,  BO,  INC2	LFPDUX	A7,  AO,  INC2	LFPDUX	A8,  AO,  INC2	LFPDUX	B4,  BO,  INC2	bdz-	.L113	.align 4.L112:	fxcpmadd	f0,  B1, A1, f0	LFPDUX	A1,  AO,  INC2	fxcsmadd	f1,  B1, A2, f1	LFPDUX	A2,  AO,  INC2	LFPDUX	B1,  BO,  INC2	fxcpmadd	f2,  B2, A3, f2	LFPDUX	A3,  AO,  INC2	fxcsmadd	f3,  B2, A4, f3	LFPDUX	A4,  AO,  INC2	LFPDUX	B2,  BO,  INC2	fxcpmadd	f0,  B3, A5, f0	LFPDUX	A5,  AO,  INC2	fxcsmadd	f1,  B3, A6, f1	LFPDUX	A6,  AO,  INC2	LFPDUX	B3,  BO,  INC2	fxcpmadd	f2,  B4, A7, f2	LFPDUX	A7,  AO,  INC2	fxcsmadd	f3,  B4, A8, f3	LFPDUX	A8,  AO,  INC2	LFPDUX	B4,  BO,  INC2	bdnz+	.L112	.align 4.L113:	fxcpmadd	f0,  B1, A1, f0	fxcsmadd	f1,  B1, A2, f1	fxcpmadd	f2,  B2, A3, f2	fxcsmadd	f3,  B2, A4, f3	fxcpmadd	f0,  B3, A5, f0	fxcsmadd	f1,  B3, A6, f1	fxcpmadd	f2,  B4, A7, f2	fxcsmadd	f3,  B4, A8, f3	.align 4.L114:	lfd	AP,  ALPHA(SP)#ifdef TRMMKERNEL       fsmfp	AP, AP#endif#if defined(TRMMKERNEL)#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	TEMP, K, KK#elif defined(LEFT)	addi	TEMP, KK, 2#else	addi	TEMP, KK, 1#endif	andi.	TEMP,  TEMP,  7	mtspr	CTR, TEMP#else	andi.	r0,  K,  7	mtspr	CTR, r0#endif	ble+	.L118	LFPDUX	A1,  AO,  INC2	LFDX	B1,  BO,  INC2	add	BO, BO, INC	bdz-	.L117	.align 4.L116:	fxcpmadd	f0,  B1, A1, f0	LFPDUX	A1,  AO,  INC2	LFDX	B1,  BO,  INC2	add	BO, BO, INC	bdnz+	.L116	.align 4.L117:	fxcpmadd	f0,  B1, A1, f0	.align 4.L118:#ifndef TRMMKERNEL	LFPDX	A1, CO1, INC2	fpadd	f0, f0, f1	fpadd	f2, f3, f2	fpadd	f0, f0, f2	fxcpmadd	f1,  AP, f0,  A1	li	r0, FZERO	lfpsx	f0, SP, r0	STFPDUX	f1,  CO1, INC2#else	fpadd	f0, f0, f1	fpadd	f2, f3, f2	fpadd	f0, f0, f2	fpmul	f1,  AP, f0	li	r0, FZERO	lfpsx	f0, SP, r0	STFPDUX	f1,  CO1, INC2#endif#ifdef TRMMKERNEL#if ( defined(LEFT) &&  defined(TRANSA)) || \    (!defined(LEFT) && !defined(TRANSA))	sub	TEMP, K, KK#ifdef LEFT	addi	TEMP, TEMP, -2#else	addi	TEMP, TEMP, -1#endif	slwi	r0,   TEMP, 1 + BASE_SHIFT	slwi	TEMP, TEMP, 0 + BASE_SHIFT	add	AO, AO, r0	add	BO, BO, TEMP#endif#ifdef LEFT	addi	KK, KK, 2#endif#endif	.align 4.L120:	andi.	I, M,  1	beq	.L999#if defined(TRMMKERNEL)#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))	addi	BO,  B,  - 2 * SIZE	fpmr	f1,  f0	fpmr	f2,  f0	fpmr	f3,  f0#else	slwi	TEMP, KK, 0 + BASE_SHIFT	slwi	r0,   KK, 0 + BASE_SHIFT	add	AO, AO, TEMP	add	BO, B,  r0	fpmr	f1,  f0	addi	BO,  BO,  - 2 * SIZE	fpmr	f2,  f0	fpmr	f3,  f0#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	TEMP, K, KK#elif defined(LEFT)	addi	TEMP, KK, 1#else	addi	TEMP, KK, 1#endif	srawi.	r0,  TEMP,  3	mtspr	CTR, r0	ble	.L124#else	addi	BO,  B,  - 2 * SIZE	fpmr	f1,  f0	fpmr	f2,  f0	fpmr	f3,  f0	srawi.	r0,  K,  3	mtspr	CTR, r0	ble	.L124#endif	LFPDUX	A1,  AO,  INC2	LFPDUX	B1,  BO,  INC2	LFPDUX	A2,  AO,  INC2	LFPDUX	B2,  BO,  INC2	LFPDUX	A3,  AO,  INC2	LFPDUX	B3,  BO,  INC2	LFPDUX	A4,  AO,  INC2	LFPDUX	B4,  BO,  INC2	bdz-	.L123	.align 4.L122:	fpmadd	f0,  A1, B1, f0	LFPDUX	A1,  AO,  INC2	LFPDUX	B1,  BO,  INC2	fpmadd	f1,  A2, B2, f1	LFPDUX	A2,  AO,  INC2	LFPDUX	B2,  BO,  INC2	fpmadd	f2,  A3, B3, f2	LFPDUX	A3,  AO,  INC2	LFPDUX	B3,  BO,  INC2	fpmadd	f3,  A4, B4, f3	LFPDUX	A4,  AO,  INC2	LFPDUX	B4,  BO,  INC2	bdnz+	.L122	.align 4.L123:	fpmadd	f0,  A1, B1, f0	fpmadd	f1,  A2, B2, f1	fpmadd	f2,  A3, B3, f2	fpmadd	f3,  A4, B4, f3	.align 4.L124:	lfd	AP,  ALPHA(SP)#ifdef TRMMKERNEL       fsmfp	AP, AP#endif#if defined(TRMMKERNEL)#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	TEMP, K, KK#elif defined(LEFT)	addi	TEMP, KK, 1#else	addi	TEMP, KK, 1#endif	andi.	TEMP,  TEMP,  7	mtspr	CTR, TEMP#else	andi.	r0,  K,  7	mtspr	CTR, r0#endif	ble+	.L128	LFDX	A1,  AO,  INC2	LFDX	B1,  BO,  INC2	add	AO, AO, INC	add	BO, BO, INC	bdz-	.L127	.align 4.L126:	fmadd	f0,  A1, B1, f0	LFDX	A1,  AO,  INC2	LFDX	B1,  BO,  INC2	add	AO, AO, INC	add	BO, BO, INC	bdnz+	.L126	.align 4.L127:	fmadd	f0,  A1, B1, f0	.align 4.L128:#ifndef TRMMKERNEL	LFDX	A1, CO1, INC2	fpadd	f0, f0, f1	fpadd	f2, f2, f3	fpadd	f0, f0, f2	fsmtp	f1, f0	fadd	f0, f0, f1	fmadd	f0,  AP, f0,  A1#else	fpadd	f0, f0, f1	fpadd	f2, f2, f3	fpadd	f0, f0, f2	fsmtp	f1, f0	fadd	f0, f0, f1	fpmul	f0,  AP, f0#endif	STFDUX	f0,  CO1, INC2	.align 4.L999:	addi	SP, SP, 12	lwzu	r14,   4(SP)	lwzu	r15,   4(SP)	lwzu	r16,   4(SP)	lwzu	r17,   4(SP)	lwzu	r18,   4(SP)	lwzu	r19,   4(SP)	lwzu	r20,   4(SP)	lwzu	r21,   4(SP)	lwzu	r22,   4(SP)	lwzu	r23,   4(SP)	lwzu	r24,   4(SP)	lwzu	r25,   4(SP)	lwzu	r26,   4(SP)	lwzu	r27,   4(SP)	lwzu	r28,   4(SP)	lwzu	r29,   4(SP)	lwzu	r30,   4(SP)	lwzu	r31,   4(SP)	subi	SP, SP, 12	li	r0, 16	lfpdux	f31, SP, r0	lfpdux	f30, SP, r0	lfpdux	f29, SP, r0	lfpdux	f28, SP, r0	lfpdux	f27, SP, r0	lfpdux	f26, SP, r0	lfpdux	f25, SP, r0	lfpdux	f24, SP, r0	lfpdux	f23, SP, r0	lfpdux	f22, SP, r0	lfpdux	f21, SP, r0	lfpdux	f20, SP, r0	lfpdux	f19, SP, r0	lfpdux	f18, SP, r0	lfpdux	f17, SP, r0	lfpdux	f16, SP, r0	lfpdux	f15, SP, r0	lfpdux	f14, SP, r0	addi	SP, SP, 16	blr	.align 4.L1000:	li	INCM1, -1 * SIZE	li	INCM3, -3 * SIZE	li	INCM5, -5 * SIZE	li	INCM7, -7 * SIZE	addi	C, C, - 1 * SIZE	srawi.	J, N,  2	ble	.L1050	.align 4.L1010:	mr	CO1, C	add	CO2, C,   LDC	add	CO3, CO2, LDC	add	CO4, CO3, LDC	add	C,   CO4, LDC#if defined(TRMMKERNEL) &&  defined(LEFT)	mr	KK, OFFSET#endif	addi	AO, A, -4 * SIZE		li	r0, FZERO	lfpsx	f0, SP, r0	srawi.	I, M,  3	ble	.L1020	.align 4.L1011:#if defined(TRMMKERNEL)#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))	addi	AO2, AO,   2 * SIZE	fpmr	f4,  f0	addi	BO,  B,  - 4 * SIZE	fpmr	f8,  f0	addi	BO2, B,  - 2 * SIZE	fpmr	f12, f0#else	slwi	TEMP, KK, 3 + BASE_SHIFT	slwi	r0,   KK, 2 + BASE_SHIFT	add	AO, AO, TEMP	add	BO, B,  r0	addi	AO2, AO,   2 * SIZE	fpmr	f4,  f0	addi	BO,  BO, - 4 * SIZE	fpmr	f8,  f0	addi	BO2, BO,   2 * SIZE	fpmr	f12, f0#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	TEMP, K, KK#elif defined(LEFT)	addi	TEMP, KK, 8#else	addi	TEMP, KK, 4#endif	srawi.	TEMP,  TEMP,  2 	fpmr	f1,  f0	mtspr	CTR, TEMP	ble	.L1014#else	addi	AO2, AO,   2 * SIZE	fpmr	f4,  f0	addi	BO,  B,  - 4 * SIZE	fpmr	f8,  f0	addi	BO2, B,  - 2 * SIZE	fpmr	f12, f0	srawi.	r0,  K,  2 	fpmr	f1,  f0	mtspr	CTR, r0	ble	.L1014#endif	LFPDUX	A1,  AO, INC4	fpmr	f5,  f0	LFPDUX	A3,  AO, INC4	fpmr	f9,  f0	LFPDUX	B1,  BO, INC4	fpmr	f13, f0	LFPDUX	A5,  AO, INC4	fpmr	f2,  f0	LFPDUX	A6,  AO, INC4	fpmr	f6,  f0	LFPDUX	B3,  BO, INC4	fpmr	f10, f0	LFPDUX	A7,  AO, INC4	fpmr	f14, f0	LFPDUX	A8,  AO, INC4	fpmr	f3,  f0	LFPDUX	B5,  BO, INC4	fpmr	f7,  f0	LFPDUX	A9,  AO, INC4	fpmr	f11, f0	LFPDUX	A2, AO2, INC4	fpmr	f15, f0	LFPDUX	B2, BO2, INC4	bdz-	.L1013	.align 4.L1012:## 1 ##	fxcpmadd	f0,  B1, A1, f0	nop	fxcsmadd	f4,  B1, A1, f4	nop	fxcpmadd	f8,  B2, A1, f8	LFPDUX	B4, BO2, INC4	fxcsmadd	f12, B2, A1, f12	LFPDUX	B6,  BO, INC4	fxcpmadd	f1,  B1, A2, f1	nop	fxcsmadd	f5,  B1, A2, f5	LFPDUX	A4, AO2, INC4	fxcpmadd	f9,  B2, A2, f9	LFPDUX	A10, AO, INC4	fxcsmadd	f13, B2, A2, f13	nop	fxcpmadd	f2,  B1, A3, f2	nop	fxcsmadd	f6,  B1, A3, f6	nop	fxcpmadd	f10, B2, A3, f10	nop	fxcsmadd	f14, B2, A3, f14	nop	fxcpmadd	f3,  B1, A4, f3	nop	fxcsmadd	f7,  B1, A4, f7	LFPDUX	A2, AO2, INC4	fxcpmadd	f11, B2, A4, f11	LFPDUX	A1,  AO, INC4	fxcsmadd	f15, B2, A4, f15	nop## 2 ##	fxcpmadd	f0,  B3, A5, f0	nop	fxcsmadd	f4,  B3, A5, f4	nop	fxcpmadd	f8,  B4, A5, f8	LFPDUX	B2, BO2, INC4	fxcsmadd	f12, B4, A5, f12	LFPDUX	B1,  BO, INC4	fxcpmadd	f1,  B3, A2, f1	nop	fxcsmadd	f5,  B3, A2, f5	LFPDUX	A4, AO2, INC4	fxcpmadd	f9,  B4, A2, f9	LFPDUX	A3,  AO, INC4	fxcsmadd	f13, B4, A2, f13	nop	fxcpmadd	f2,  B3, A6, f2	nop	fxcsmadd	f6,  B3, A6, f6	nop	fxcpmadd	f10, B4, A6, f10	nop	fxcsmadd	f14, B4, A6, f14	nop	fxcpmadd	f3,  B3, A4, f3	nop	fxcsmadd	f7,  B3, A4, f7	LFPDUX	A2, AO2, INC4	fxcpmadd	f11, B4, A4, f11	LFPDUX	A5,  AO, INC4	fxcsmadd	f15, B4, A4, f15	nop## 3 ##	fxcpmadd	f0,  B5, A7, f0	nop	fxcsmadd	f4,  B5, A7, f4	nop	fxcpmadd	f8,  B2, A7, f8	LFPDUX	B4, BO2, INC4	fxcsmadd	f12, B2, A7, f12	LFPDUX	B3,  BO, INC4	fxcpmadd	f1,  B5, A2, f1	nop	fxcsmadd	f5,  B5, A2, f5	LFPDUX	A4, AO2, INC4	fxcpmadd	f9,  B2, A2, f9	LFPDUX	A6,  AO, INC4	fxcsmadd	f13, B2, A2, f13	nop	fxcpmadd	f2,  B5, A8, f2	nop	fxcsmadd	f6,  B5, A8, f6	nop	fxcpmadd	f10, B2, A8, f10	nop	fxcsmadd	f14, B2, A8, f14	nop	fxcpmadd	f3,  B5, A4, f3	nop	fxcsmadd	f7,  B5, A4, f7	LFPDUX	A2, AO2, INC4	fxcpmadd	f11, B2, A4, f11	LFPDUX	A7,  AO, INC4	fxcsmadd	f15, B2, A4, f15	nop## 4 ##	fxcpmadd	f0,  B6, A9, f0	nop	fxcsmadd	f4,  B6, A9, f4	nop	fxcpmadd	f8,  B4, A9, f8	LFPDUX	B2, BO2, INC4	fxcsmadd	f12, B4, A9, f12	LFPDUX	B5,  BO, INC4	fxcpmadd	f1,  B6, A2, f1	nop	fxcsmadd	f5,  B6, A2, f5	LFPDUX	A4, AO2, INC4	fxcpmadd	f9,  B4, A2, f9	LFPDUX	A8,  AO, INC4	fxcsmadd	f13, B4, A2, f13	nop	fxcpmadd	f2,  B6, A10, f2	nop	fxcsmadd	f6,  B6, A10, f6	nop	fxcpmadd	f10, B4, A10, f10	nop	fxcsmadd	f14, B4, A10, f14	nop	fxcpmadd	f3,  B6, A4, f3	LFPDUX	A2, AO2, INC4	fxcsmadd	f7,  B6, A4, f7	LFPDUX	A9,  AO, INC4	fxcpmadd	f11, B4, A4, f11	nop		fxcsmadd	f15, B4, A4, f15	bdnz+	.L1012	.align 4.L1013:## 1 ##	fxcpmadd	f0,  B1, A1, f0	nop	fxcsmadd	f4,  B1, A1, f4	nop	fxcpmadd	f8,  B2, A1, f8	LFPDUX	B4, BO2, INC4	fxcsmadd	f12, B2, A1, f12	LFPDUX	B6,  BO, INC4	fxcpmadd	f1,  B1, A2, f1	nop	fxcsmadd	f5,  B1, A2, f5	LFPDUX	A4, AO2, INC4	fxcpmadd	f9,  B2, A2, f9	LFPDUX	A10, AO, INC4	fxcsmadd	f13, B2, A2, f13	nop	fxcpmadd	f2,  B1, A3, f2	nop	fxcsmadd	f6,  B1, A3, f6	nop	fxcpmadd	f10, B2, A3, f10	nop	fxcsmadd	f14, B2, A3, f14	nop	fxcpmadd	f3,  B1, A4, f3	nop	fxcsmadd	f7,  B1, A4, f7	LFPDUX	A2, AO2, INC4	fxcpmadd	f11, B2, A4, f11#ifndef TRMMKERNEL	LFDUX	A1, CO1, INC#else	nop#endif	fxcsmadd	f15, B2, A4, f15	nop## 2 ##	fxcpmadd	f0,  B3, A5, f0	nop	fxcsmadd	f4,  B3, A5, f4	nop	fxcpmadd	f8,  B4, A5, f8	LFPDUX	B2, BO2, INC4	fxcsma

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -