⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemm_kernel.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 4 页
字号:
	FMADD	f7,  f7, f30, f23
	/*
	 * Tail of the preceding tile's write-back (its beginning, including the
	 * conditional that the #else / #endif below belong to, lies above this
	 * excerpt).  Four values from each of CO3 and CO4 are combined with the
	 * accumulators scaled by f30.
	 */
	LFD	f16, 0 * SIZE(CO3)
	LFD	f17, 1 * SIZE(CO3)
	LFD	f18, 2 * SIZE(CO3)
	LFD	f19, 3 * SIZE(CO3)
	LFD	f20, 0 * SIZE(CO4)
	LFD	f21, 1 * SIZE(CO4)
	LFD	f22, 2 * SIZE(CO4)
	LFD	f23, 3 * SIZE(CO4)
	FMADD	f8,  f8,  f30, f16
	FMADD	f9,  f9,  f30, f17
	FMADD	f10, f10, f30, f18
	FMADD	f11, f11, f30, f19
	FMADD	f12, f12, f30, f20
	FMADD	f13, f13, f30, f21
	FMADD	f14, f14, f30, f22
	FMADD	f15, f15, f30, f23
#else
	/* Alternate branch: scale the accumulators by f30 without reading CO1..CO4. */
	FMUL	f0,  f0, f30
	FMUL	f1,  f1, f30
	FMUL	f2,  f2, f30
	FMUL	f3,  f3, f30
	FMUL	f4,  f4, f30
	FMUL	f5,  f5, f30
	FMUL	f6,  f6, f30
	FMUL	f7,  f7, f30
	FMUL	f8,  f8,  f30
	FMUL	f9,  f9,  f30
	FMUL	f10, f10, f30
	FMUL	f11, f11, f30
	FMUL	f12, f12, f30
	FMUL	f13, f13, f30
	FMUL	f14, f14, f30
	FMUL	f15, f15, f30
#endif
	/*
	 * Store the 4 x 4 results and, interleaved with the stores, reload every
	 * accumulator f0..f15 from FZERO (presumably 0.0 -- defined outside this
	 * excerpt) so the next tile starts from a clean state.
	 */
	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  1 * SIZE(CO1)
	STFD	f2,  2 * SIZE(CO1)
	STFD	f3,  3 * SIZE(CO1)
	lfs	f0,  FZERO
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0
	STFD	f4,  0 * SIZE(CO2)
	STFD	f5,  1 * SIZE(CO2)
	STFD	f6,  2 * SIZE(CO2)
	STFD	f7,  3 * SIZE(CO2)
	fmr	f4,  f0
	fmr	f5,  f0
	fmr	f6,  f0
	fmr	f7,  f0
	STFD	f8,  0 * SIZE(CO3)
	STFD	f9,  1 * SIZE(CO3)
	STFD	f10, 2 * SIZE(CO3)
	STFD	f11, 3 * SIZE(CO3)
	fmr	f8,  f0
	fmr	f9,  f0
	fmr	f10, f0
	fmr	f11, f0
	STFD	f12, 0 * SIZE(CO4)
	STFD	f13, 1 * SIZE(CO4)
	STFD	f14, 2 * SIZE(CO4)
	STFD	f15, 3 * SIZE(CO4)
	fmr	f12, f0
	fmr	f13, f0
	fmr	f14, f0
	fmr	f15, f0
	addi	CO1, CO1, 4 * SIZE
	addi	CO2, CO2, 4 * SIZE
	addi	CO3, CO3, 4 * SIZE
	addi	CO4, CO4, 4 * SIZE

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/*
	 * Advance AO and BO by (K - KK - 4) << (2 + BASE_SHIFT) bytes.  Both arms
	 * of the LEFT conditional subtract 4 here; the two-arm shape mirrors the
	 * narrower tiles below, where the arms differ.
	 */
	sub	TEMP, K, KK
#ifdef LEFT
	addi	TEMP, TEMP, -4
#else
	addi	TEMP, TEMP, -4
#endif
	slwi	TEMP, TEMP, 2 + BASE_SHIFT
	add	AO, AO, TEMP
	add	BO, BO, TEMP
#endif
#ifdef LEFT
	addi	KK, KK, 4
#endif
#endif
	/* Latch of the loop headed by LL(11) (above this excerpt): repeat while --I > 0. */
	addic.	I, I, -1
	bgt+	LL(11)
	.align 4

/*
 * LL(20): tile taken when bit 1 of M is set.  Two values from AO are
 * multiplied with four values from BO per k step; the results are written to
 * two elements of each of CO1..CO4.
 */
LL(20):
	andi.	I,  M,  2
	ble	LL(30)
#if defined(TRMMKERNEL)
#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	/* Start from the current AO and the beginning of B. */
	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)
	LFD	f20,  0 * SIZE(B)
	LFD	f21,  1 * SIZE(B)
	LFD	f22,  2 * SIZE(B)
	LFD	f23,  3 * SIZE(B)
	LFD	f24,  4 * SIZE(B)
	LFD	f25,  5 * SIZE(B)
	LFD	f26,  6 * SIZE(B)
	LFD	f27,  7 * SIZE(B)
	mr	BO,  B
#else
	/* Skip KK k-steps: 2 elements per step in AO, 4 per step in BO. */
	slwi	r0,   KK, 1 + BASE_SHIFT
	slwi	TEMP, KK, 2 + BASE_SHIFT
	add	AO, AO, r0
	add	BO, B,  TEMP
	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)
	LFD	f20,  0 * SIZE(BO)
	LFD	f21,  1 * SIZE(BO)
	LFD	f22,  2 * SIZE(BO)
	LFD	f23,  3 * SIZE(BO)
	LFD	f24,  4 * SIZE(BO)
	LFD	f25,  5 * SIZE(BO)
	LFD	f26,  6 * SIZE(BO)
	LFD	f27,  7 * SIZE(BO)
#endif
	/* TEMP = number of k steps for this tile; CTR = TEMP / 4 unrolled passes. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 2
#else
	addi	TEMP, KK, 4
#endif
	srawi.	TEMP,  TEMP,  2
	mtspr	CTR, TEMP
#else
	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)
	LFD	f20,  0 * SIZE(B)
	LFD	f21,  1 * SIZE(B)
	LFD	f22,  2 * SIZE(B)
	LFD	f23,  3 * SIZE(B)
	LFD	f24,  4 * SIZE(B)
	LFD	f25,  5 * SIZE(B)
	LFD	f26,  6 * SIZE(B)
	LFD	f27,  7 * SIZE(B)
	srawi.	r0,  K,  2
	mtspr	CTR, r0
	mr	BO,  B
#endif
	/* The condition set by srawi. above is still live: skip if no full pass. */
	ble	LL(25)
	.align 5

/*
 * LL(22): main loop, four k steps per pass (AO advances 8 elements, BO 16).
 * Even steps accumulate into f0/f1, f4/f5, f8/f9, f12/f13; odd steps into
 * f2/f3, f6/f7, f10/f11, f14/f15.  Loads for later steps are interleaved with
 * the multiply-adds, so the instruction order is significant.
 */
LL(22):
	FMADD	f0,  f16, f20, f0
	FMADD	f1,  f17, f20, f1
	FMADD	f4,  f16, f21, f4
	FMADD	f5,  f17, f21, f5
	FMADD	f8,  f16, f22, f8
	FMADD	f9,  f17, f22, f9
	FMADD	f12, f16, f23, f12
	FMADD	f13, f17, f23, f13
	LFD	f20,  8 * SIZE(BO)
	LFD	f21,  9 * SIZE(BO)
	LFD	f22, 10 * SIZE(BO)
	LFD	f23, 11 * SIZE(BO)
	FMADD	f2,  f18, f24, f2
	FMADD	f3,  f19, f24, f3
	FMADD	f6,  f18, f25, f6
	FMADD	f7,  f19, f25, f7
	FMADD	f10, f18, f26, f10
	FMADD	f11, f19, f26, f11
	FMADD	f14, f18, f27, f14
	FMADD	f15, f19, f27, f15
	LFD	f16,  4 * SIZE(AO)
	LFD	f17,  5 * SIZE(AO)
	LFD	f18,  6 * SIZE(AO)
	LFD	f19,  7 * SIZE(AO)
	FMADD	f0,  f16, f20, f0
	FMADD	f1,  f17, f20, f1
	FMADD	f4,  f16, f21, f4
	FMADD	f5,  f17, f21, f5
	LFD	f24, 12 * SIZE(BO)
	LFD	f25, 13 * SIZE(BO)
	LFD	f26, 14 * SIZE(BO)
	LFD	f27, 15 * SIZE(BO)
	FMADD	f8,  f16, f22, f8
	FMADD	f9,  f17, f22, f9
	FMADD	f12, f16, f23, f12
	FMADD	f13, f17, f23, f13
	LFD	f20, 16 * SIZE(BO)
	LFD	f21, 17 * SIZE(BO)
	LFD	f22, 18 * SIZE(BO)
	LFD	f23, 19 * SIZE(BO)
	FMADD	f2,  f18, f24, f2
	FMADD	f3,  f19, f24, f3
	FMADD	f6,  f18, f25, f6
	FMADD	f7,  f19, f25, f7
	FMADD	f10, f18, f26, f10
	FMADD	f11, f19, f26, f11
	FMADD	f14, f18, f27, f14
	FMADD	f15, f19, f27, f15
	LFD	f16,  8 * SIZE(AO)
	LFD	f17,  9 * SIZE(AO)
	LFD	f18, 10 * SIZE(AO)
	LFD	f19, 11 * SIZE(AO)
	LFD	f24, 20 * SIZE(BO)
	LFD	f25, 21 * SIZE(BO)
	LFD	f26, 22 * SIZE(BO)
	LFD	f27, 23 * SIZE(BO)
	addi	AO, AO,  8 * SIZE
	addi	BO, BO, 16 * SIZE
	PREFETCH_B
	bdnz	LL(22)
	/* Fold the odd-step partial sums into the even-step accumulators. */
	fadd	f0,  f2,  f0
	fadd	f1,  f3,  f1
	fadd	f4,  f6,  f4
	fadd	f5,  f7,  f5
	fadd	f8,  f10, f8
	fadd	f9,  f11, f9
	fadd	f12, f14, f12
	fadd	f13, f15, f13
	.align 4

/* LL(25): load the scale factor and set CTR to the leftover k steps (count & 3). */
LL(25):
	lfd	f30,  ALPHA
#if defined(TRMMKERNEL)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 2
#else
	addi	TEMP, KK, 4
#endif
	andi.	TEMP,  TEMP,  3
	mtspr	CTR, TEMP
#else
	andi.	r0,  K,  3
	mtspr	CTR, r0
#endif
	ble+	LL(28)
	.align 4

/* LL(26): remainder loop, one k step per pass. */
LL(26):
	FMADD	f0,  f16, f20, f0
	FMADD	f1,  f17, f20, f1
	FMADD	f4,  f16, f21, f4
	FMADD	f5,  f17, f21, f5
	FMADD	f8,  f16, f22, f8
	FMADD	f9,  f17, f22, f9
	FMADD	f12, f16, f23, f12
	FMADD	f13, f17, f23, f13
	LFD	f16,  2 * SIZE(AO)
	LFD	f17,  3 * SIZE(AO)
	LFD	f20,  4 * SIZE(BO)
	LFD	f21,  5 * SIZE(BO)
	LFD	f22,  6 * SIZE(BO)
	LFD	f23,  7 * SIZE(BO)
	addi	BO, BO,  4 * SIZE
	addi	AO, AO,  2 * SIZE
	bdnz	LL(26)
	.align 4

/*
 * LL(28): write-back for the 2-row tile.  Without TRMMKERNEL the result is
 * acc * f30 + (existing value at COn); with TRMMKERNEL it is acc * f30 only.
 */
LL(28):
#ifndef TRMMKERNEL
	LFD	f16, 0 * SIZE(CO1)
	LFD	f17, 1 * SIZE(CO1)
	LFD	f18, 0 * SIZE(CO2)
	LFD	f19, 1 * SIZE(CO2)
	FMADD	f0,  f0, f30, f16
	FMADD	f1,  f1, f30, f17
	FMADD	f4,  f4, f30, f18
	FMADD	f5,  f5, f30, f19
	LFD	f20, 0 * SIZE(CO3)
	LFD	f21, 1 * SIZE(CO3)
	LFD	f22, 0 * SIZE(CO4)
	LFD	f23, 1 * SIZE(CO4)
	FMADD	f8,  f8,  f30, f20
	FMADD	f9,  f9,  f30, f21
	FMADD	f12, f12, f30, f22
	FMADD	f13, f13, f30, f23
#else
	FMUL	f0,  f0, f30
	FMUL	f1,  f1, f30
	FMUL	f4,  f4, f30
	FMUL	f5,  f5, f30
	FMUL	f8,  f8,  f30
	FMUL	f9,  f9,  f30
	FMUL	f12, f12, f30
	FMUL	f13, f13, f30
#endif
	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  1 * SIZE(CO1)
	STFD	f4,  0 * SIZE(CO2)
	STFD	f5,  1 * SIZE(CO2)
	/* Reload all sixteen accumulators from FZERO for the next tile. */
	lfs	f0,  FZERO
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0
	STFD	f8,  0 * SIZE(CO3)
	STFD	f9,  1 * SIZE(CO3)
	STFD	f12, 0 * SIZE(CO4)
	STFD	f13, 1 * SIZE(CO4)
	fmr	f4,  f0
	fmr	f5,  f0
	fmr	f6,  f0
	fmr	f7,  f0
	fmr	f8,  f0
	fmr	f9,  f0
	fmr	f10, f0
	fmr	f11, f0
	fmr	f12, f0
	fmr	f13, f0
	fmr	f14, f0
	fmr	f15, f0
	addi	CO1, CO1, 2 * SIZE
	addi	CO2, CO2, 2 * SIZE
	addi	CO3, CO3, 2 * SIZE
	addi	CO4, CO4, 2 * SIZE
#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Step AO/BO past the k steps this tile did not consume (2 resp. 4 elements per step). */
	sub	TEMP, K, KK
#ifdef LEFT
	addi	TEMP, TEMP, -2
#else
	addi	TEMP, TEMP, -4
#endif
	slwi	r0,   TEMP, 1 + BASE_SHIFT
	slwi	TEMP, TEMP, 2 + BASE_SHIFT
	add	AO, AO, r0
	add	BO, BO, TEMP
#endif
#ifdef LEFT
	addi	KK, KK, 2
#endif
#endif
	.align 4

/*
 * LL(30): tile taken when bit 0 of M is set.  One value from AO is multiplied
 * with four values from BO per k step; one element of each of CO1..CO4 is
 * written.
 */
LL(30):
	andi.	I,  M,  1
	ble	LL(39)
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	/* Start from the current AO and the beginning of B. */
	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)
	LFD	f20,  0 * SIZE(B)
	LFD	f21,  1 * SIZE(B)
	LFD	f22,  2 * SIZE(B)
	LFD	f23,  3 * SIZE(B)
	LFD	f24,  4 * SIZE(B)
	LFD	f25,  5 * SIZE(B)
	LFD	f26,  6 * SIZE(B)
	LFD	f27,  7 * SIZE(B)
	mr	BO,  B
#else
	/* Skip KK k-steps: 1 element per step in AO, 4 per step in BO. */
	slwi	r0,   KK, 0 + BASE_SHIFT
	slwi	TEMP, KK, 2 + BASE_SHIFT
	add	AO, AO, r0
	add	BO, B,  TEMP
	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)
	LFD	f20,  0 * SIZE(BO)
	LFD	f21,  1 * SIZE(BO)
	LFD	f22,  2 * SIZE(BO)
	LFD	f23,  3 * SIZE(BO)
	LFD	f24,  4 * SIZE(BO)
	LFD	f25,  5 * SIZE(BO)
	LFD	f26,  6 * SIZE(BO)
	LFD	f27,  7 * SIZE(BO)
#endif
	/* TEMP = number of k steps for this tile; CTR = TEMP / 4 unrolled passes. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 1
#else
	addi	TEMP, KK, 4
#endif
	srawi.	TEMP,  TEMP,  2
	mtspr	CTR, TEMP
#else
	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)
	LFD	f20,  0 * SIZE(B)
	LFD	f21,  1 * SIZE(B)
	LFD	f22,  2 * SIZE(B)
	LFD	f23,  3 * SIZE(B)
	LFD	f24,  4 * SIZE(B)
	LFD	f25,  5 * SIZE(B)
	LFD	f26,  6 * SIZE(B)
	LFD	f27,  7 * SIZE(B)
	srawi.	r0,  K,  2
	mtspr	CTR, r0
	mr	BO,  B
#endif
	/* The condition set by srawi. above is still live: skip if no full pass. */
	ble	LL(35)
	.align 5

/*
 * LL(32): main loop, four k steps per pass (AO advances 4 elements, BO 16).
 * Even steps accumulate into f0, f4, f8, f12; odd steps into f1, f5, f9, f13.
 */
LL(32):
	FMADD	f0,  f16, f20, f0
	FMADD	f4,  f16, f21, f4
	FMADD	f8,  f16, f22, f8
	FMADD	f12, f16, f23, f12
	LFD	f20,  8 * SIZE(BO)
	LFD	f21,  9 * SIZE(BO)
	LFD	f22, 10 * SIZE(BO)
	LFD	f23, 11 * SIZE(BO)
	FMADD	f1,  f17, f24, f1
	FMADD	f5,  f17, f25, f5
	FMADD	f9,  f17, f26, f9
	FMADD	f13, f17, f27, f13
	LFD	f24, 12 * SIZE(BO)
	LFD	f25, 13 * SIZE(BO)
	LFD	f26, 14 * SIZE(BO)
	LFD	f27, 15 * SIZE(BO)
	FMADD	f0,  f18, f20, f0
	FMADD	f4,  f18, f21, f4
	FMADD	f8,  f18, f22, f8
	FMADD	f12, f18, f23, f12
	LFD	f20, 16 * SIZE(BO)
	LFD	f21, 17 * SIZE(BO)
	LFD	f22, 18 * SIZE(BO)
	LFD	f23, 19 * SIZE(BO)
	FMADD	f1,  f19, f24, f1
	FMADD	f5,  f19, f25, f5
	FMADD	f9,  f19, f26, f9
	FMADD	f13, f19, f27, f13
	LFD	f16,  4 * SIZE(AO)
	LFD	f17,  5 * SIZE(AO)
	LFD	f18,  6 * SIZE(AO)
	LFD	f19,  7 * SIZE(AO)
	LFD	f24, 20 * SIZE(BO)
	LFD	f25, 21 * SIZE(BO)
	LFD	f26, 22 * SIZE(BO)
	LFD	f27, 23 * SIZE(BO)
	addi	AO, AO,  4 * SIZE
	addi	BO, BO, 16 * SIZE
	PREFETCH_B
	bdnz	LL(32)
	/* Fold the odd-step partial sums into the even-step accumulators. */
	fadd	f0,  f1,   f0
	fadd	f4,  f5,   f4
	fadd	f8,  f9,   f8
	fadd	f12, f13, f12
	.align 4

/* LL(35): load the scale factor and set CTR to the leftover k steps (count & 3). */
LL(35):
	lfd	f30,  ALPHA
#if defined(TRMMKERNEL)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 1
#else
	addi	TEMP, KK, 4
#endif
	andi.	TEMP,  TEMP,  3
	mtspr	CTR, TEMP
#else
	andi.	r0,  K,  3
	mtspr	CTR, r0
#endif
	ble+	LL(38)
	.align 4

/* LL(36): remainder loop, one k step per pass. */
LL(36):
	FMADD	f0,  f16, f20, f0
	FMADD	f4,  f16, f21, f4
	FMADD	f8,  f16, f22, f8
	FMADD	f12, f16, f23, f12
	LFD	f16,  1 * SIZE(AO)
	LFD	f20,  4 * SIZE(BO)
	LFD	f21,  5 * SIZE(BO)
	LFD	f22,  6 * SIZE(BO)
	LFD	f23,  7 * SIZE(BO)
	addi	BO, BO,  4 * SIZE
	addi	AO, AO,  1 * SIZE
	bdnz	LL(36)
	.align 4

/*
 * LL(38): write-back for the 1-row tile.  Without TRMMKERNEL the result is
 * acc * f30 + (existing value at COn); with TRMMKERNEL it is acc * f30 only.
 */
LL(38):
#ifndef TRMMKERNEL
	LFD	f16, 0 * SIZE(CO1)
	LFD	f18, 0 * SIZE(CO2)
	LFD	f20, 0 * SIZE(CO3)
	LFD	f22, 0 * SIZE(CO4)
	FMADD	f0,  f0,  f30, f16
	FMADD	f4,  f4,  f30, f18
	FMADD	f8,  f8,  f30, f20
	FMADD	f12, f12, f30, f22
#else
	FMUL	f0,  f0,  f30
	FMUL	f4,  f4,  f30
	FMUL	f8,  f8,  f30
	FMUL	f12, f12, f30
#endif
	STFD	f0,  0 * SIZE(CO1)
	STFD	f4,  0 * SIZE(CO2)
	STFD	f8,  0 * SIZE(CO3)
	STFD	f12, 0 * SIZE(CO4)
	/* Only the eight accumulators this tile wrote are reloaded from FZERO. */
	lfs	f0,  FZERO
	fmr	f1,  f0
	fmr	f4,  f0
	fmr	f5,  f0
	fmr	f8,  f0
	fmr	f9,  f0
	fmr	f12, f0
	fmr	f13, f0
#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Step AO/BO past the k steps this tile did not consume (1 resp. 4 elements per step). */
	sub	TEMP, K, KK
#ifdef LEFT
	addi	TEMP, TEMP, -1
#else
	addi	TEMP, TEMP, -4
#endif
	slwi	r0,   TEMP, 0 + BASE_SHIFT
	slwi	TEMP, TEMP, 2 + BASE_SHIFT
	add	AO, AO, r0
	add	BO, BO, TEMP
#endif
#ifdef LEFT
	/*
	 * This tile is one row tall (KK + 1 and -1 are used for it above), so KK
	 * advances by 1, in line with the +4 and +2 steps of the wider tiles.
	 */
	addi	KK, KK, 1
#endif
#endif
	.align 4

/*
 * LL(39): end of one pass of the loop headed by LL(10) (above this excerpt).
 * B is moved to where BO stopped, and the loop repeats while --J > 0.
 */
LL(39):
#if defined(TRMMKERNEL) && !defined(LEFT)
	addi	KK, KK, 4
#endif
	mr	B,  BO
	addic.	J, J, -1
	bgt	LL(10)
	.align 4

/*
 * LL(40): section taken when bit 1 of N is set; only CO1 and CO2 are set up.
 * The remainder of this section lies below this excerpt.
 */
LL(40):
	mr	CO1, C
	add	CO2, C,  LDC
	andi.	J, N,  2
	ble	LL(70)
#if defined(TRMMKERNEL) && defined(LEFT)
	mr	KK, OFFSET
#endif
	lfs	f0,  FZERO
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0
	fmr	f4,  f0

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -