⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zgemm_kernel_hummer.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 5 页
字号:
	fpsub	f2,  f2,  f6
	fpsub	f10, f10, f14
	fpsub	f3,  f3,  f7
	fpsub	f11, f11, f15
#endif

/*
 * NOTE(review): this excerpt begins inside a conditional whose opening
 * #if is above the visible text; the lines above are the tail of its
 * subtract branch.  All register/offset names used below (A1.., B1..,
 * AP, AO, BO, CO1, CO2, INC2, INC4, INCM3, INCM7, ALPHA, FZERO, ...) are
 * macros defined outside this excerpt.
 *
 * Write-back for the block handled by the .L11 loop: each accumulator
 * (f0-f3, f8-f11) goes through an fxcpmadd / fxcxnpma pair with AP and is
 * stored through CO1 / CO2.  Without TRMMKERNEL the addend registers are
 * A1, B1, A3, ... (presumably values loaded from CO1/CO2 above this
 * excerpt - confirm); with TRMMKERNEL the addend is f30.
 */
#ifndef TRMMKERNEL
	fxcpmadd A1,  f0, AP,  A1
	fxcpmadd B1,  f1, AP,  B1
	fxcpmadd A3,  f2, AP,  A3
	fxcpmadd A5,  f3, AP,  A5

	fxcxnpma f0,  f0, AP,  A1
	fxcpmadd B3,  f8,  AP,  B3
	fxcxnpma f1,  f1, AP,  B1
	fxcpmadd A6,  f9,  AP,  A6
	fxcxnpma f2,  f2, AP,  A3
	fxcpmadd A7,  f10, AP,  A7

	fxcxnpma f3,  f3, AP,  A5
	fxcpmadd B2,  f11, AP,  B2
	fxcxnpma f8,  f8,  AP,  B3
	/* First store of each column uses INCM7 rather than INC2; presumably
	   this steps the pointer back over entries read earlier - confirm
	   against the INCM7 definition. */
	STFPDUX	f0,  CO1, INCM7
	fxcxnpma f9,  f9,  AP,  A6
	STFPDUX	f1,  CO1, INC2
	fxcxnpma f10, f10, AP,  A7
	STFPDUX	f2,  CO1, INC2
	fxcxnpma f11, f11, AP,  B2
	STFPDUX	f3,  CO1, INC2

	STFPDUX	f8,  CO2, INCM7
	STFPDUX	f9,  CO2, INC2
	STFPDUX	f10, CO2, INC2
	STFPDUX	f11, CO2, INC2
#else
	fxcpmadd f12, f0,  AP,  f30
	fxcpmadd f13, f1,  AP,  f30
	fxcpmadd f14, f2,  AP,  f30
	fxcpmadd f15, f3,  AP,  f30

	fxcxnpma f0,  f0,  AP,  f12
	fxcxnpma f1,  f1,  AP,  f13
	fxcxnpma f2,  f2,  AP,  f14
	fxcxnpma f3,  f3,  AP,  f15

	fxcpmadd f16, f8,  AP,  f30
	fxcpmadd f17, f9,  AP,  f30
	fxcpmadd f18, f10, AP,  f30
	fxcpmadd f19, f11, AP,  f30

	fxcxnpma f8,  f8,  AP,  f16
	fxcxnpma f9,  f9,  AP,  f17
	fxcxnpma f10, f10, AP,  f18
	fxcxnpma f11, f11, AP,  f19

	STFPDUX	f0,  CO1, INC2
	STFPDUX	f1,  CO1, INC2
	STFPDUX	f2,  CO1, INC2
	STFPDUX	f3,  CO1, INC2

	STFPDUX	f8,  CO2, INC2
	STFPDUX	f9,  CO2, INC2
	STFPDUX	f10, CO2, INC2
	STFPDUX	f11, CO2, INC2
#endif

/*
 * TRMM only: for the (LEFT && TRANSA) / (!LEFT && !TRANSA) variants,
 * advance AO and BO by (K - KK - 4 or 2) shifted left by
 * 2 + ZBASE_SHIFT and 1 + ZBASE_SHIFT respectively; with LEFT, KK is
 * then bumped by 4.
 */
#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	sub	TEMP, K, KK
#ifdef LEFT
	addi	TEMP, TEMP, -4
#else
	addi	TEMP, TEMP, -2
#endif
	slwi	r0,   TEMP, 2 + ZBASE_SHIFT
	slwi	TEMP, TEMP, 1 + ZBASE_SHIFT
	add	AO, AO, r0
	add	BO, BO, TEMP
#endif

#ifdef LEFT
	addi	KK, KK, 4
#endif
#endif

	/* Decrement I; reload f0 from the FZERO slot (f0 is the source for
	   the fpmr accumulator initialisation below); repeat .L11 while
	   I > 0. */
	addic.	I, I, -1
	li	r0, FZERO

	lfpsx	f0, SP, r0
	bgt+	.L11
	.align 4

/* .L20: executed only when bit 1 of M is set (M & 2), else skip to .L30. */
.L20:
	andi.	I, M,  2
	beq	.L30

#if defined(TRMMKERNEL)
	/* Set up AO2/BO/BO2 and copy f0 into accumulators f4, f8, f12.  In
	   the #else variant AO and BO are first offset by
	   KK << (1 + ZBASE_SHIFT). */
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	addi	AO2, AO,   2 * SIZE
	fpmr	f4,  f0
	addi	BO,  B,  - 4 * SIZE
	fpmr	f8,  f0
	addi	BO2, B,  - 2 * SIZE
	fpmr	f12, f0
#else
	slwi	TEMP, KK, 1 + ZBASE_SHIFT
	slwi	r0,   KK, 1 + ZBASE_SHIFT
	add	AO, AO, TEMP
	add	BO, B,  r0

	addi	AO2, AO,   2 * SIZE
	fpmr	f4,  f0
	addi	BO,  BO, - 4 * SIZE
	fpmr	f8,  f0
	addi	BO2, BO,   2 * SIZE
	fpmr	f12, f0
#endif

	/* TEMP = trip count for this block: K - KK, or KK + 2 (both the
	   #elif and the #else branch add 2 here). */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 2
#else
	addi	TEMP, KK, 2
#endif
	/* CTR = TEMP >> 2; skip to .L24 when that is <= 0. */
	srawi.	r0,  TEMP,  2
	fpmr	f1,  f0
	fpmr	f5,  f0
	fpmr	f9,  f0
	mtspr	CTR, r0
	fpmr	f13, f0
	ble	.L24
#else
	addi	AO2, AO,   2 * SIZE
	fpmr	f4,  f0
	addi	BO,  B,  - 4 * SIZE
	fpmr	f8,  f0
	addi	BO2, B,  - 2 * SIZE
	fpmr	f12, f0

	/* CTR = K >> 2; skip to .L24 when that is <= 0. */
	srawi.	r0,  K,  2
	fpmr	f1,  f0
	fpmr	f5,  f0
	fpmr	f9,  f0
	mtspr	CTR, r0
	fpmr	f13, f0
	ble	.L24
#endif

	/* Preload operands for the first pass of .L22 (A8 is loaded inside
	   the loop / in .L23). */
	LFPDUX	A1,   AO, INC4
	LFPDUX	B1,   BO, INC4
	LFPDUX	A2,  AO2, INC4
	LFPDUX	B2,  BO2, INC4
	LFPDUX	A3,   AO, INC4
	LFPDUX	B3,   BO, INC4
	LFPDUX	A4,  AO2, INC4
	LFPDUX	B4,  BO2, INC4

	LFPDUX	A5,   AO, INC4
	LFPDUX	B5,   BO, INC4
	LFPDUX	A6,  AO2, INC4
	LFPDUX	B6,  BO2, INC4
	LFPDUX	A7,   AO, INC4
	LFPDUX	A9,   BO, INC4
	LFPDUX	A10, BO2, INC4
	/* Decrement CTR; if it reaches zero go straight to the final pass. */
	bdz-	.L23
	.align 4

/*
 * .L22: main loop for the M & 2 block.  Each pass issues the multiply-adds
 * on the currently loaded operands (accumulating into f0/f4/f8/f12 and
 * f1/f5/f9/f13) while reloading the same registers through AO/AO2/BO/BO2
 * for the next pass.  A9/A10 are used in the B-operand position for the
 * last group.
 */
.L22:
	FXCPMADD	f0,  B1, A1, f0
	nop
	FXCSMADD	f4,  B1, A1, f4
	LFPDUX	A8,  AO2, INC4
	FXCPMADD	f8,  B2, A1, f8
	nop
	FXCSMADD	f12, B2, A1, f12
	LFPDUX	A1,   AO, INC4

	FXCPMADD	f1,  B1, A2, f1
	nop
	FXCSMADD	f5,  B1, A2, f5
	LFPDUX	B1,   BO, INC4
	FXCPMADD	f9,  B2, A2, f9
	nop
	FXCSMADD	f13, B2, A2, f13
	LFPDUX	B2,  BO2, INC4

	FXCPMADD	f0,  B3, A3, f0
	nop
	FXCSMADD	f4,  B3, A3, f4
	LFPDUX	A2,  AO2, INC4
	FXCPMADD	f8,  B4, A3, f8
	nop
	FXCSMADD	f12, B4, A3, f12
	LFPDUX	A3,   AO, INC4

	FXCPMADD	f1,  B3, A4, f1
	nop
	FXCSMADD	f5,  B3, A4, f5
	LFPDUX	B3,   BO, INC4
	FXCPMADD	f9,  B4, A4, f9
	nop
	FXCSMADD	f13, B4, A4, f13
	LFPDUX	B4,  BO2, INC4

	FXCPMADD	f0,  B5, A5, f0
	nop
	FXCSMADD	f4,  B5, A5, f4
	LFPDUX	A4,  AO2, INC4
	FXCPMADD	f8,  B6, A5, f8
	nop
	FXCSMADD	f12, B6, A5, f12
	LFPDUX	A5,   AO, INC4

	FXCPMADD	f1,  B5, A6, f1
	nop
	FXCSMADD	f5,  B5, A6, f5
	LFPDUX	B5,   BO, INC4
	FXCPMADD	f9,  B6, A6, f9
	nop
	FXCSMADD	f13, B6, A6, f13
	LFPDUX	B6,  BO2, INC4

	FXCPMADD	f0,  A9,  A7, f0
	nop
	FXCSMADD	f4,  A9,  A7, f4
	LFPDUX	A6,  AO2, INC4
	FXCPMADD	f8,  A10, A7, f8
	nop
	FXCSMADD	f12, A10, A7, f12
	LFPDUX	A7,   AO, INC4

	FXCPMADD	f1,  A9,  A8, f1
	nop
	FXCSMADD	f5,  A9,  A8, f5
	LFPDUX	A9,   BO, INC4
	FXCPMADD	f9,  A10, A8, f9
	nop
	FXCSMADD	f13, A10, A8, f13
	LFPDUX	A10, BO2, INC4
	bdnz+	.L22
	.align 4

/* .L23: final pass - same multiply-adds as .L22, but only A8 is still
   loaded; nothing is prefetched for a following pass. */
.L23:
	FXCPMADD	f0,  B1, A1, f0
	FXCSMADD	f4,  B1, A1, f4
	LFPDUX	A8,  AO2, INC4
	FXCPMADD	f8,  B2, A1, f8
	FXCSMADD	f12, B2, A1, f12

	FXCPMADD	f1,  B1, A2, f1
	FXCSMADD	f5,  B1, A2, f5
	FXCPMADD	f9,  B2, A2, f9
	FXCSMADD	f13, B2, A2, f13

	FXCPMADD	f0,  B3, A3, f0
	FXCSMADD	f4,  B3, A3, f4
	FXCPMADD	f8,  B4, A3, f8
	FXCSMADD	f12, B4, A3, f12

	FXCPMADD	f1,  B3, A4, f1
	FXCSMADD	f5,  B3, A4, f5
	FXCPMADD	f9,  B4, A4, f9
	FXCSMADD	f13, B4, A4, f13

	FXCPMADD	f0,  B5, A5, f0
	FXCSMADD	f4,  B5, A5, f4
	FXCPMADD	f8,  B6, A5, f8
	FXCSMADD	f12, B6, A5, f12

	FXCPMADD	f1,  B5, A6, f1
	FXCSMADD	f5,  B5, A6, f5
	FXCPMADD	f9,  B6, A6, f9
	FXCSMADD	f13, B6, A6, f13

	FXCPMADD	f0,  A9, A7, f0
	FXCSMADD	f4,  A9, A7, f4
	FXCPMADD	f8,  A10, A7, f8
	FXCSMADD	f12, A10, A7, f12

	FXCPMADD	f1,  A9, A8, f1
	FXCSMADD	f5,  A9, A8, f5
	FXCPMADD	f9,  A10, A8, f9
	FXCSMADD	f13, A10, A8, f13
	.align 4

/*
 * .L24: load AP from the ALPHA slot (and, for TRMM, f30 from the FZERO
 * slot), then set CTR to the low two bits of the trip count (K & 3, or
 * TEMP & 3 for TRMM).  If that is zero, go straight to the write-back
 * at .L28.
 */
.L24:
	li	r0, ALPHA
	lfpdx	AP,  SP, r0
#ifdef TRMMKERNEL
	li	r0, FZERO
	lfpsx	f30, SP, r0
#endif

#if defined(TRMMKERNEL)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 2
#else
	addi	TEMP, KK, 2
#endif
	andi.	r0,  TEMP,  3
	mtspr	CTR, r0
#else
	andi.	r0,  K,  3
	mtspr	CTR, r0
#endif
	ble+	.L28

	LFPDUX	A1,  AO,  INC4
	LFPDUX	A2,  AO2, INC4
	LFPDUX	B1,  BO,  INC4
	LFPDUX	B2,  BO2, INC4
	bdz-	.L27
	.align 4

/* .L26: remainder loop - one group of multiply-adds per pass, reloading
   A1/A2/B1/B2 for the next pass. */
.L26:
	FXCPMADD	f0,  B1, A1, f0
	FXCSMADD	f4,  B1, A1, f4
	FXCPMADD	f8,  B2, A1, f8
	FXCSMADD	f12, B2, A1, f12
	LFPDUX	A1,  AO,  INC4

	FXCPMADD	f1,  B1, A2, f1
	FXCSMADD	f5,  B1, A2, f5
	LFPDUX	B1,  BO,  INC4
	FXCPMADD	f9,  B2, A2, f9
	FXCSMADD	f13, B2, A2, f13
	LFPDUX	A2,  AO2, INC4
	LFPDUX	B2,  BO2, INC4
	bdnz+	.L26
	.align 4

/* .L27: last remainder pass, no further loads. */
.L27:
	FXCPMADD	f0,  B1, A1, f0
	FXCSMADD	f4,  B1, A1, f4
	FXCPMADD	f8,  B2, A1, f8
	FXCSMADD	f12, B2, A1, f12

	FXCPMADD	f1,  B1, A2, f1
	FXCSMADD	f5,  B1, A2, f5
	FXCPMADD	f9,  B2, A2, f9
	FXCSMADD	f13, B2, A2, f13
	.align 4

/*
 * .L28: write-back for the M & 2 block.  Non-TRMM builds first read two
 * entries each through CO1 and CO2 into A1..A4.  The accumulator pairs
 * (f0,f4) (f8,f12) (f1,f5) (f9,f13) are then added for the
 * NN/NT/TN/TT/RN/RT/CN/CT variants and subtracted otherwise.
 */
.L28:
#ifndef TRMMKERNEL
	LFPDUX	A1, CO1, INC2
	LFPDUX	A2, CO1, INC2
	LFPDUX	A3, CO2, INC2
	LFPDUX	A4, CO2, INC2
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RN) || defined(RT) || defined(CN) || defined(CT)
	fpadd	f0, f0, f4
	fpadd	f8, f8, f12
	fpadd	f1, f1, f5
	fpadd	f9, f9, f13
#else
	fpsub	f0, f0, f4
	fpsub	f8, f8, f12
	fpsub	f1, f1, f5
	fpsub	f9, f9, f13
#endif

#ifndef TRMMKERNEL
	fxcpmadd A1,  f0, AP,  A1
	fxcpmadd A2,  f1, AP,  A2
	fxcpmadd A3,  f8, AP,  A3
	fxcpmadd A4,  f9, AP,  A4

	fxcxnpma f0,  f0, AP,  A1
	fxcxnpma f1,  f1, AP,  A2
	fxcxnpma f8,  f8, AP,  A3
	fxcxnpma f9,  f9, AP,  A4

	/* INCM3 on the first store of each column presumably rewinds the
	   pointer advanced by the two loads above - confirm against the
	   INCM3 definition. */
	STFPDUX	f0,  CO1, INCM3
	STFPDUX	f1,  CO1, INC2

	STFPDUX	f8,  CO2, INCM3
	STFPDUX	f9,  CO2, INC2
#else
	fxcpmadd f12,  f0, AP,  f30
	fxcpmadd f13,  f1, AP,  f30
	fxcpmadd f14,  f8, AP,  f30
	fxcpmadd f15,  f9, AP,  f30

	fxcxnpma f0,  f0, AP,  f12
	fxcxnpma f1,  f1, AP,  f13
	fxcxnpma f8,  f8, AP,  f14
	fxcxnpma f9,  f9, AP,  f15

	STFPDUX	f0,  CO1, INC2
	STFPDUX	f1,  CO1, INC2

	STFPDUX	f8,  CO2, INC2
	STFPDUX	f9,  CO2, INC2
#endif

/* TRMM only: advance AO and BO by (K - KK - 2) << (1 + ZBASE_SHIFT) for
   the (LEFT && TRANSA) / (!LEFT && !TRANSA) variants; with LEFT,
   KK += 2. */
#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	sub	TEMP, K, KK
#ifdef LEFT
	addi	TEMP, TEMP, -2
#else
	addi	TEMP, TEMP, -2
#endif
	slwi	r0,   TEMP, 1 + ZBASE_SHIFT
	slwi	TEMP, TEMP, 1 + ZBASE_SHIFT
	add	AO, AO, r0
	add	BO, BO, TEMP
#endif

#ifdef LEFT
	addi	KK, KK, 2
#endif
#endif

	/* Reload f0 from the FZERO slot for the next block's accumulator
	   initialisation. */
	li	r0, FZERO
	lfpsx	f0, SP, r0
	.align 4

/* .L30: executed only when bit 0 of M is set (M & 1), else skip to .L49. */
.L30:
	andi.	I, M,  1
	beq	.L49

#if defined(TRMMKERNEL)
	/* Pointer setup as in .L20, except that the AO offset in the #else
	   variant is KK << (0 + ZBASE_SHIFT). */
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	addi	AO2, AO,   2 * SIZE
	fpmr	f1,  f0
	addi	BO,  B,  - 4 * SIZE
	fpmr	f2,  f0
	addi	BO2, B,  - 2 * SIZE
	fpmr	f3, f0
#else
	slwi	TEMP, KK, 0 + ZBASE_SHIFT
	slwi	r0,   KK, 1 + ZBASE_SHIFT
	add	AO, AO, TEMP
	add	BO, B,  r0

	addi	AO2, AO,   2 * SIZE
	fpmr	f1,  f0
	addi	BO,  BO,  - 4 * SIZE
	fpmr	f2,  f0
	addi	BO2, BO,    2 * SIZE
	fpmr	f3, f0
#endif

	/* TEMP = K - KK, KK + 1 (LEFT) or KK + 2 (otherwise). */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 1
#else
	addi	TEMP, KK, 2
#endif
	/* CTR = TEMP >> 2; skip to .L34 when that is <= 0. */
	srawi.	r0,  TEMP,  2
	mtspr	CTR, r0
	ble	.L34
#else
	addi	AO2, AO,   2 * SIZE
	fpmr	f1,  f0
	addi	BO,  B,  - 4 * SIZE
	fpmr	f2,  f0
	addi	BO2, B,  - 2 * SIZE
	fpmr	f3, f0

	/* CTR = K >> 2; skip to .L34 when that is <= 0. */
	srawi.	r0,  K,  2
	mtspr	CTR, r0
	ble	.L34
#endif

	/* Preload operands for the first pass of .L32. */
	LFPDUX	A1,  AO, INC4
	LFPDUX	B1,  BO, INC4
	LFPDUX	B2, BO2, INC4
	LFPDUX	A2, AO2, INC4
	LFPDUX	B3,  BO, INC4
	LFPDUX	B4, BO2, INC4

	LFPDUX	A3,  AO, INC4
	LFPDUX	A5,  BO, INC4
	LFPDUX	A6, BO2, INC4
	LFPDUX	A4, AO2, INC4
	LFPDUX	A7,  BO, INC4
	LFPDUX	A8, BO2, INC4
	bdz-	.L33
	.align 4

/*
 * .L32: main loop for the M & 1 block.  Four groups per pass, each using
 * one A operand (A1..A4) against two B-side operands (B1/B2, B3/B4,
 * A5/A6, A7/A8) and reloading them for the next pass.
 */
.L32:
	FXCPMADD	f0,  B1, A1, f0
	FXCSMADD	f1,  B1, A1, f1
	LFPDUX	B1,  BO, INC4
	FXCPMADD	f2,  B2, A1, f2
	FXCSMADD	f3,  B2, A1, f3
	LFPDUX	B2, BO2, INC4
	LFPDUX	A1,  AO, INC4

	FXCPMADD	f0,  B3, A2, f0
	FXCSMADD	f1,  B3, A2, f1
	LFPDUX	B3,  BO, INC4
	FXCPMADD	f2,  B4, A2, f2
	FXCSMADD	f3,  B4, A2, f3
	LFPDUX	B4, BO2, INC4
	LFPDUX	A2, AO2, INC4

	FXCPMADD	f0,  A5, A3, f0
	FXCSMADD	f1,  A5, A3, f1
	LFPDUX	A5,  BO, INC4
	FXCPMADD	f2,  A6, A3, f2
	FXCSMADD	f3,  A6, A3, f3
	LFPDUX	A6, BO2, INC4
	LFPDUX	A3,  AO, INC4

	FXCPMADD	f0,  A7, A4, f0
	FXCSMADD	f1,  A7, A4, f1
	LFPDUX	A7,  BO, INC4
	FXCPMADD	f2,  A8, A4, f2
	FXCSMADD	f3,  A8, A4, f3
	LFPDUX	A8, BO2, INC4
	LFPDUX	A4, AO2, INC4
	bdnz+	.L32
	.align 4

/* .L33: final pass of the main loop, no further loads. */
.L33:
	FXCPMADD	f0,  B1, A1, f0
	FXCSMADD	f1,  B1, A1, f1
	FXCPMADD	f2,  B2, A1, f2
	FXCSMADD	f3,  B2, A1, f3

	FXCPMADD	f0,  B3, A2, f0
	FXCSMADD	f1,  B3, A2, f1
	FXCPMADD	f2,  B4, A2, f2
	FXCSMADD	f3,  B4, A2, f3

	FXCPMADD	f0,  A5, A3, f0
	FXCSMADD	f1,  A5, A3, f1
	FXCPMADD	f2,  A6, A3, f2
	FXCSMADD	f3,  A6, A3, f3

	FXCPMADD	f0,  A7, A4, f0
	FXCSMADD	f1,  A7, A4, f1
	FXCPMADD	f2,  A8, A4, f2
	FXCSMADD	f3,  A8, A4, f3
	.align 4

/* .L34: same AP / f30 / remainder-count setup as .L24, for the M & 1
   block; a zero remainder goes straight to .L38. */
.L34:
	li	r0, ALPHA
	lfpdx	AP,  SP, r0
#ifdef TRMMKERNEL
	li	r0, FZERO
	lfpsx	f30, SP, r0
#endif

#if defined(TRMMKERNEL)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 1
#else
	addi	TEMP, KK, 2
#endif
	andi.	r0,  TEMP,  3
	mtspr	CTR, r0
#else
	andi.	r0,  K,  3
	mtspr	CTR, r0
#endif
	ble+	.L38

	/* A1 is read with the non-updating LFPDX at AO + INC4, and AO is
	   then advanced separately by INC2. */
	LFPDX	A1,  AO,  INC4
	LFPDUX	B1,  BO,  INC4
	LFPDUX	B2,  BO2, INC4
	add	AO, AO, INC2
	bdz-	.L37
	.align 4

/* .L36: remainder loop for the M & 1 block. */
.L36:
	FXCPMADD	f0,  B1, A1, f0
	FXCSMADD	f1,  B1, A1, f1
	LFPDUX	B1,  BO,  INC4
	FXCPMADD	f2,  B2, A1, f2
	FXCSMADD	f3,  B2, A1, f3
	LFPDX	A1,  AO,  INC4
	LFPDUX	B2,  BO2, INC4
	add	AO, AO, INC2
	bdnz+	.L36
	.align 4

/* .L37: last remainder pass, no further loads. */
.L37:
	FXCPMADD	f0,  B1, A1, f0
	FXCSMADD	f1,  B1, A1, f1
	FXCPMADD	f2,  B2, A1, f2
	FXCSMADD	f3,  B2, A1, f3
	.align 4

/*
 * .L38: write-back for the M & 1 block.  Non-TRMM builds read one entry
 * each at CO1 + INC2 and CO2 + INC2 without updating the pointers; the
 * stores below then update CO1/CO2 by INC2.
 */
.L38:
#ifndef TRMMKERNEL
	LFPDX	A1, CO1, INC2
	LFPDX	A2, CO2, INC2
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RN) || defined(RT) || defined(CN) || defined(CT)
	fpadd	f0, f0, f1
	fpadd	f2, f2, f3
#else
	fpsub	f0, f0, f1
	fpsub	f2, f2, f3
#endif

#ifndef TRMMKERNEL
	fxcpmadd A1,  f0, AP,  A1
	fxcpmadd A2,  f2, AP,  A2
	fxcxnpma f0,  f0, AP,  A1
	fxcxnpma f2,  f2, AP,  A2
#else
	fxcpmadd f12, f0, AP,  f30
	fxcpmadd f13, f2, AP,  f30
	fxcxnpma f0,  f0, AP,  f12
	fxcxnpma f2,  f2, AP,  f13
#endif

	STFPDUX	f0,  CO1, INC2
	STFPDUX	f2,  CO2, INC2

/* TRMM only: AO += (K - KK - 1 or 2) << (0 + ZBASE_SHIFT) and
   BO += the same count << (1 + ZBASE_SHIFT) for the
   (LEFT && TRANSA) / (!LEFT && !TRANSA) variants; with LEFT, KK += 1. */
#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	sub	TEMP, K, KK
#ifdef LEFT
	addi	TEMP, TEMP, -1
#else
	addi	TEMP, TEMP, -2
#endif
	slwi	r0,   TEMP, 0 + ZBASE_SHIFT
	slwi	TEMP, TEMP, 1 + ZBASE_SHIFT
	add	AO, AO, r0
	add	BO, BO, TEMP
#endif

#ifdef LEFT
	addi	KK, KK, 1
#endif
#endif

	li	r0, FZERO
	lfpsx	f0, SP, r0
	.align 4

/*
 * .L49: end of one pass of the J loop.  For TRMM without LEFT, KK += 2.
 * B is set to BO + 4 * SIZE, J is decremented, and control returns to
 * .L10 (above this excerpt) while J > 0.
 */
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addi	KK, KK, 2
#endif

	addi	B,  BO, 4 * SIZE

	addic.	J, J, -1
	bgt+	.L10
	.align 4

/*
 * .L50: executed only when bit 0 of N is set (N & 1), else jump to .L999.
 * CO1 = C, AO = A - 2 * SIZE, f0 reloaded from the FZERO slot, and
 * I = M >> 2; if I <= 0 skip to .L60 (below this excerpt).
 */
.L50:
	andi.	J, N,  1
	beq	.L999

	mr	CO1, C

#if defined(TRMMKERNEL) &&  defined(LEFT)
	mr	KK, OFFSET
#endif

	addi	AO, A, -2 * SIZE

	li	r0, FZERO
	lfpsx	f0, SP, r0

	srawi.	I, M,  2
	ble	.L60
	.align 4

/*
 * .L51: start of the loop over I for the N & 1 case.  Copies f0 into
 * accumulators f1-f7, sets BO (and for some TRMM variants offsets AO/BO
 * by KK), and sets CTR = (K or TEMP) >> 2, skipping to .L54 when that is
 * <= 0.
 * NOTE(review): the excerpt ends partway through the operand preload
 * that follows; the rest of this loop is not visible here.
 */
.L51:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	fpmr	f4,  f0
	addi	BO,  B,  - 2 * SIZE
	fpmr	f1,  f0
	fpmr	f5,  f0
	fpmr	f2,  f0
	fpmr	f6,  f0
#else
	slwi	TEMP, KK, 2 + ZBASE_SHIFT
	slwi	r0,   KK, 0 + ZBASE_SHIFT
	add	AO, AO, TEMP
	add	BO, B,  r0

	fpmr	f4,  f0
	addi	BO,  BO,  - 2 * SIZE
	fpmr	f1,  f0
	fpmr	f5,  f0
	fpmr	f2,  f0
	fpmr	f6,  f0
#endif

	/* TEMP = K - KK, KK + 4 (LEFT) or KK + 1 (otherwise). */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 4
#else
	addi	TEMP, KK, 1
#endif
	srawi.	r0,  TEMP,  2
	fpmr	f3,  f0
	mtspr	CTR, r0
	fpmr	f7,  f0
	ble	.L54
#else
	srawi.	r0,  K,  2
	fpmr	f4,  f0
	addi	BO,  B,  - 2 * SIZE
	fpmr	f1,  f0
	fpmr	f5,  f0
	fpmr	f2,  f0
	fpmr	f6,  f0
	fpmr	f3,  f0
	mtspr	CTR, r0
	fpmr	f7,  f0
	ble	.L54
#endif

	LFPDUX	B1,  BO,  INC2
	LFPDUX	A1,  AO,  INC2
	LFPDUX	A2,  AO,  INC2
	LFPDUX	B2,  BO,  INC2
	LFPDUX	A3,  AO,  INC2
	LFPDUX	A4,  AO,  INC2

	LFPDUX	B3,  BO,  INC2
	LFPDUX	A5,  AO,  INC2
	LFPDUX	A6,  AO,  INC2

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -