⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemm_kernel.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 4 页
字号:
	/*
	 * NOTE(review): this chunk starts inside a routine whose entry point is
	 * not visible here.  All upper-case operand names (A, B, C, M, N, K, KK,
	 * OFFSET, TEMP, AO, BO, CO1, CO2, I, J, ALPHA, FZERO, PREC, SIZE,
	 * BASE_SHIFT, STACKSIZE, SP) and the macros LL(), LFD, STFD, FMADD,
	 * FMUL, FADD, PREFETCH_B and EPILOGUE are defined elsewhere.
	 * Comments below describe FMADD d,a,c,b as d = a*c + b, presuming it
	 * maps to the PowerPC fmadd family -- confirm against its definition.
	 *
	 * Tail of the preceding section: store the two results, then reset
	 * f0/f1/f4/f5 from FZERO.
	 */
	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  0 * SIZE(CO2)

	lfs	f0,  FZERO
	fmr	f1,  f0
	fmr	f4,  f0
	fmr	f5,  f0

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* TEMP = K - KK - (1 or 2); AO and BO are advanced by TEMP elements
	   scaled by 1x and 2x the element size respectively. */
	sub	TEMP, K, KK
#ifdef LEFT
	addi	TEMP, TEMP, -1
#else
	addi	TEMP, TEMP, -2
#endif
	slwi	r0,   TEMP, 0 + BASE_SHIFT
	slwi	TEMP, TEMP, 1 + BASE_SHIFT
	add	AO, AO, r0
	add	BO, BO, TEMP
#endif

#ifdef LEFT
	addi	KK, KK, 1
#endif
#endif
	.align 4

LL(69):
#if defined(TRMMKERNEL) && !defined(LEFT)
	addi	KK, KK, 2
#endif
	mr	B,  BO
	.align 4

/*
 * LL(70): entered with CO1 = C.  The rest of the work is done only when
 * (N & 1) is non-zero; otherwise control goes straight to the exit at
 * LL(999).
 */
LL(70):
	mr	CO1, C
	andi.	J, N,  1
	ble	LL(999)

#if defined(TRMMKERNEL) && defined(LEFT)
	mr	KK, OFFSET
#endif

	/* Clear the four accumulators f0..f3. */
	lfs	f0,  FZERO
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	/* I = M >> 2: number of passes through LL(71), each of which stores
	   four results through CO1. */
	srawi.	I, M,  2
	mr	AO, A
	ble	LL(80)
	.align 4

LL(71):
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)

	LFD	f20,  0 * SIZE(B)
	LFD	f21,  1 * SIZE(B)
	LFD	f22,  2 * SIZE(B)
	LFD	f23,  3 * SIZE(B)
	mr	BO,  B
#else
	/* Skip KK steps: 4 elements per step in AO, 1 per step in BO. */
	slwi	r0,   KK, 2 + BASE_SHIFT
	slwi	TEMP, KK, 0 + BASE_SHIFT
	add	AO, AO, r0
	add	BO, B,  TEMP

	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)

	LFD	f20,  0 * SIZE(BO)
	LFD	f21,  1 * SIZE(BO)
	LFD	f22,  2 * SIZE(BO)
	LFD	f23,  3 * SIZE(BO)
#endif
	dcbt	CO1, PREC

	/* TEMP = trip count for this tile; CTR = TEMP >> 2 because LL(72)
	   consumes four steps per pass. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 4
#else
	addi	TEMP, KK, 1
#endif
	srawi.	TEMP,  TEMP,  2
	mtspr	CTR, TEMP
#else
	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)

	LFD	f20,  0 * SIZE(B)
	LFD	f21,  1 * SIZE(B)
	LFD	f22,  2 * SIZE(B)
	LFD	f23,  3 * SIZE(B)

	dcbt	CO1, PREC
	srawi.	r0,  K,  2
	mtspr	CTR, r0
	mr	BO,  B
#endif
	ble	LL(75)
	.align 5

/*
 * LL(72): CTR-counted main loop for the 4-wide tile.  Each pass performs
 * 16 FMADDs (four steps x four accumulators), consumes 16 elements from
 * AO and 4 from BO, and preloads the operands for the next pass.
 */
LL(72):
	FMADD	f0,  f16, f20, f0
	FMADD	f1,  f17, f20, f1
	FMADD	f2,  f18, f20, f2
	FMADD	f3,  f19, f20, f3

	LFD	f16,  4 * SIZE(AO)
	LFD	f17,  5 * SIZE(AO)
	LFD	f18,  6 * SIZE(AO)
	LFD	f19,  7 * SIZE(AO)

	FMADD	f0,  f16, f21, f0
	FMADD	f1,  f17, f21, f1
	FMADD	f2,  f18, f21, f2
	FMADD	f3,  f19, f21, f3

	LFD	f16,  8 * SIZE(AO)
	LFD	f17,  9 * SIZE(AO)
	LFD	f18, 10 * SIZE(AO)
	LFD	f19, 11 * SIZE(AO)

	FMADD	f0,  f16, f22, f0
	FMADD	f1,  f17, f22, f1
	FMADD	f2,  f18, f22, f2
	FMADD	f3,  f19, f22, f3

	LFD	f16, 12 * SIZE(AO)
	LFD	f17, 13 * SIZE(AO)
	LFD	f18, 14 * SIZE(AO)
	LFD	f19, 15 * SIZE(AO)

	FMADD	f0,  f16, f23, f0
	FMADD	f1,  f17, f23, f1
	FMADD	f2,  f18, f23, f2
	FMADD	f3,  f19, f23, f3

	LFD	f16, 16 * SIZE(AO)
	LFD	f17, 17 * SIZE(AO)
	LFD	f18, 18 * SIZE(AO)
	LFD	f19, 19 * SIZE(AO)

	LFD	f20,  4 * SIZE(BO)
	LFD	f21,  5 * SIZE(BO)
	LFD	f22,  6 * SIZE(BO)
	LFD	f23,  7 * SIZE(BO)

	addi	AO, AO, 16 * SIZE
	addi	BO, BO,  4 * SIZE
	PREFETCH_B
	bdnz	LL(72)
	.align 4

/* LL(75): f30 = ALPHA; CTR = (trip count & 3) leftover steps for LL(76). */
LL(75):
	lfd	f30,  ALPHA
#if defined(TRMMKERNEL)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 4
#else
	addi	TEMP, KK, 1
#endif
	andi.	TEMP,  TEMP,  3
	mtspr	CTR, TEMP
#else
	andi.	r0,  K,  3
	mtspr	CTR, r0
#endif
	ble+	LL(78)
	.align 4

/* LL(76): remainder loop, one step per pass (4 elements of AO, 1 of BO). */
LL(76):
	FMADD	f0,  f16, f20, f0
	FMADD	f1,  f17, f20, f1
	FMADD	f2,  f18, f20, f2
	FMADD	f3,  f19, f20, f3

	LFD	f16,  4 * SIZE(AO)
	LFD	f17,  5 * SIZE(AO)
	LFD	f18,  6 * SIZE(AO)
	LFD	f19,  7 * SIZE(AO)

	LFD	f20,  1 * SIZE(BO)
	addi	BO, BO,  1 * SIZE
	addi	AO, AO,  4 * SIZE
	bdnz	LL(76)
	.align 4

/*
 * LL(78): write back the 4-wide tile.  Without TRMMKERNEL the existing
 * values at CO1 are loaded and the result is acc * f30 + old; with
 * TRMMKERNEL the result is acc * f30 only.
 */
LL(78):
#ifndef TRMMKERNEL
	LFD	f16, 0 * SIZE(CO1)
	LFD	f17, 1 * SIZE(CO1)
	LFD	f18, 2 * SIZE(CO1)
	LFD	f19, 3 * SIZE(CO1)

	FMADD	f0,  f0, f30, f16
	FMADD	f1,  f1, f30, f17
	FMADD	f2,  f2, f30, f18
	FMADD	f3,  f3, f30, f19
#else
	FMUL	f0,  f0, f30
	FMUL	f1,  f1, f30
	FMUL	f2,  f2, f30
	FMUL	f3,  f3, f30
#endif

	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  1 * SIZE(CO1)
	STFD	f2,  2 * SIZE(CO1)
	STFD	f3,  3 * SIZE(CO1)

	/* Clear the accumulators for the next tile. */
	lfs	f0,  FZERO
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* TEMP = K - KK - (4 or 1); advance AO by 4*TEMP and BO by TEMP
	   elements. */
	sub	TEMP, K, KK
#ifdef LEFT
	addi	TEMP, TEMP, -4
#else
	addi	TEMP, TEMP, -1
#endif
	slwi	r0  , TEMP, 2 + BASE_SHIFT
	slwi	TEMP, TEMP, 0 + BASE_SHIFT
	add	AO, AO, r0
	add	BO, BO, TEMP
#endif

#ifdef LEFT
	addi	KK, KK, 4
#endif
#endif

	addi	CO1, CO1, 4 * SIZE
	addic.	I, I, -1
	bgt+	LL(71)
	.align 4

/* LL(80): 2-wide tile, executed only when (M & 2) is non-zero. */
LL(80):
	andi.	I,  M,  2
	ble	LL(90)

#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)

	LFD	f20,  0 * SIZE(B)
	LFD	f21,  1 * SIZE(B)
	LFD	f22,  2 * SIZE(B)
	LFD	f23,  3 * SIZE(B)
	mr	BO,  B
#else
	/* Skip KK steps: 2 elements per step in AO, 1 per step in BO. */
	slwi	r0,   KK, 1 + BASE_SHIFT
	slwi	TEMP, KK, 0 + BASE_SHIFT
	add	AO, AO, r0
	add	BO, B,  TEMP

	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)

	LFD	f20,  0 * SIZE(BO)
	LFD	f21,  1 * SIZE(BO)
	LFD	f22,  2 * SIZE(BO)
	LFD	f23,  3 * SIZE(BO)
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 2
#else
	addi	TEMP, KK, 1
#endif
	srawi.	TEMP,  TEMP,  2
	mtspr	CTR, TEMP
#else
	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)

	LFD	f20,  0 * SIZE(B)
	LFD	f21,  1 * SIZE(B)
	LFD	f22,  2 * SIZE(B)
	LFD	f23,  3 * SIZE(B)

	srawi.	r0,  K,  2
	mtspr	CTR, r0
	mr	BO,  B
#endif
	ble	LL(85)
	.align 5

/*
 * LL(82): main loop for the 2-wide tile, four steps per pass (8 elements
 * of AO, 4 of BO).  Even steps accumulate into f0/f1 and odd steps into
 * f2/f3; the two pairs are summed at LL(88).
 */
LL(82):
	FMADD	f0,  f16, f20, f0
	FMADD	f1,  f17, f20, f1
	FMADD	f2,  f18, f21, f2
	FMADD	f3,  f19, f21, f3

	LFD	f16,  4 * SIZE(AO)
	LFD	f17,  5 * SIZE(AO)
	LFD	f18,  6 * SIZE(AO)
	LFD	f19,  7 * SIZE(AO)

	FMADD	f0,  f16, f22, f0
	FMADD	f1,  f17, f22, f1
	FMADD	f2,  f18, f23, f2
	FMADD	f3,  f19, f23, f3

	LFD	f16,  8 * SIZE(AO)
	LFD	f17,  9 * SIZE(AO)
	LFD	f18, 10 * SIZE(AO)
	LFD	f19, 11 * SIZE(AO)

	LFD	f20,  4 * SIZE(BO)
	LFD	f21,  5 * SIZE(BO)
	LFD	f22,  6 * SIZE(BO)
	LFD	f23,  7 * SIZE(BO)

	addi	AO, AO,  8 * SIZE
	addi	BO, BO,  4 * SIZE
	PREFETCH_B
	bdnz	LL(82)
	.align 4

/* LL(85): f30 = ALPHA; CTR = (trip count & 3) leftover steps for LL(86). */
LL(85):
	lfd	f30,  ALPHA
#if defined(TRMMKERNEL)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 2
#else
	addi	TEMP, KK, 1
#endif
	andi.	TEMP,  TEMP,  3
	mtspr	CTR, TEMP
#else
	andi.	r0,  K,  3
	mtspr	CTR, r0
#endif
	ble+	LL(88)
	.align 4

/* LL(86): remainder loop, one step per pass (2 elements of AO, 1 of BO). */
LL(86):
	FMADD	f0,  f16, f20, f0
	FMADD	f1,  f17, f20, f1

	LFD	f16,  2 * SIZE(AO)
	LFD	f17,  3 * SIZE(AO)

	LFD	f20,  1 * SIZE(BO)
	addi	BO, BO,  1 * SIZE
	addi	AO, AO,  2 * SIZE
	bdnz	LL(86)
	.align 4

/*
 * LL(88): fold f2/f3 into f0/f1, scale by f30 (adding the existing CO1
 * values unless TRMMKERNEL is defined) and store the two results.
 */
LL(88):
#ifndef TRMMKERNEL
	LFD	f16, 0 * SIZE(CO1)
	LFD	f17, 1 * SIZE(CO1)

	FADD	f0, f2, f0
	FADD	f1, f3, f1

	FMADD	f0,  f0, f30, f16
	FMADD	f1,  f1, f30, f17
#else
	FADD	f0, f2, f0
	FADD	f1, f3, f1

	FMUL	f0,  f0, f30
	FMUL	f1,  f1, f30
#endif

	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  1 * SIZE(CO1)

	/* Clear the accumulators for the next tile. */
	lfs	f0,  FZERO
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	addi	CO1, CO1, 2 * SIZE

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* TEMP = K - KK - (2 or 1); advance AO by 2*TEMP and BO by TEMP
	   elements. */
	sub	TEMP, K, KK
#ifdef LEFT
	addi	TEMP, TEMP, -2
#else
	addi	TEMP, TEMP, -1
#endif
	slwi	r0  , TEMP, 1 + BASE_SHIFT
	slwi	TEMP, TEMP, 0 + BASE_SHIFT
	add	AO, AO, r0
	add	BO, BO, TEMP
#endif

#ifdef LEFT
	addi	KK, KK, 2
#endif
#endif
	.align 4

/* LL(90): 1-wide tile, executed only when (M & 1) is non-zero. */
LL(90):
	andi.	I,  M,  1
	ble	LL(999)

#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)

	LFD	f20,  0 * SIZE(B)
	LFD	f21,  1 * SIZE(B)
	LFD	f22,  2 * SIZE(B)
	LFD	f23,  3 * SIZE(B)
	mr	BO,  B
#else
	/* Skip KK steps: 1 element per step in both AO and BO. */
	slwi	r0,   KK, 0 + BASE_SHIFT
	slwi	TEMP, KK, 0 + BASE_SHIFT
	add	AO, AO, r0
	add	BO, B,  TEMP

	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)

	LFD	f20,  0 * SIZE(BO)
	LFD	f21,  1 * SIZE(BO)
	LFD	f22,  2 * SIZE(BO)
	LFD	f23,  3 * SIZE(BO)
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 1
#else
	addi	TEMP, KK, 1
#endif
	srawi.	TEMP,  TEMP,  3
	mtspr	CTR, TEMP
#else
	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)

	LFD	f20,  0 * SIZE(B)
	LFD	f21,  1 * SIZE(B)
	LFD	f22,  2 * SIZE(B)
	LFD	f23,  3 * SIZE(B)

	srawi.	r0,  K,  3
	mtspr	CTR, r0
	mr	BO,  B
#endif
	ble	LL(95)
	.align 5

/*
 * LL(92): main loop for the 1-wide tile, eight steps per pass (8 elements
 * each of AO and BO).  Four independent partial sums are kept in f0..f3
 * and combined at LL(98).
 */
LL(92):
	FMADD	f0,  f16, f20, f0
	FMADD	f1,  f17, f21, f1
	FMADD	f2,  f18, f22, f2
	FMADD	f3,  f19, f23, f3

	LFD	f16,  4 * SIZE(AO)
	LFD	f17,  5 * SIZE(AO)
	LFD	f18,  6 * SIZE(AO)
	LFD	f19,  7 * SIZE(AO)

	LFD	f20,  4 * SIZE(BO)
	LFD	f21,  5 * SIZE(BO)
	LFD	f22,  6 * SIZE(BO)
	LFD	f23,  7 * SIZE(BO)

	FMADD	f0,  f16, f20, f0
	FMADD	f1,  f17, f21, f1
	FMADD	f2,  f18, f22, f2
	FMADD	f3,  f19, f23, f3

	LFD	f16,  8 * SIZE(AO)
	LFD	f17,  9 * SIZE(AO)
	LFD	f18, 10 * SIZE(AO)
	LFD	f19, 11 * SIZE(AO)

	LFD	f20,  8 * SIZE(BO)
	LFD	f21,  9 * SIZE(BO)
	LFD	f22, 10 * SIZE(BO)
	LFD	f23, 11 * SIZE(BO)

	addi	AO, AO,  8 * SIZE
	addi	BO, BO,  8 * SIZE
	bdnz	LL(92)
	.align 4

/* LL(95): f30 = ALPHA; CTR = (trip count & 7) leftover steps for LL(96). */
LL(95):
	lfd	f30,  ALPHA
#if defined(TRMMKERNEL)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 1
#else
	addi	TEMP, KK, 1
#endif
	andi.	TEMP,  TEMP,  7
	mtspr	CTR, TEMP
#else
	andi.	r0,  K,  7
	mtspr	CTR, r0
#endif
	ble+	LL(98)
	.align 4

/* LL(96): remainder loop, one step per pass (1 element of AO and BO). */
LL(96):
	FMADD	f0,  f16, f20, f0
	LFD	f16,  1 * SIZE(AO)
	LFD	f20,  1 * SIZE(BO)
	addi	BO, BO,  1 * SIZE
	addi	AO, AO,  1 * SIZE
	bdnz	LL(96)
	.align 4

/*
 * LL(98): sum the four partial accumulators into f0, scale by f30 (adding
 * the existing CO1 value unless TRMMKERNEL is defined) and store it.
 */
LL(98):
#ifndef TRMMKERNEL
	LFD	f16, 0 * SIZE(CO1)

	FADD	f0, f1, f0
	FADD	f2, f3, f2
	FADD	f0, f2, f0

	FMADD	f0,  f0,  f30, f16
#else
	FADD	f0, f1, f0
	FADD	f2, f3, f2
	FADD	f0, f2, f0

	FMUL	f0,  f0,  f30
#endif

	STFD	f0,  0 * SIZE(CO1)
	.align 4

/*
 * LL(999): common exit.  Sets r3 = 0, reloads f14..f31 and r20..r31
 * (plus r18/r19 for TRMM/TRSM builds) from their slots relative to SP,
 * releases STACKSIZE bytes of stack and returns.
 */
LL(999):
	addi	r3, 0, 0

	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)

	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)

	lfd	f22,   64(SP)
	lfd	f23,   72(SP)
	lfd	f24,   80(SP)
	lfd	f25,   88(SP)

	lfd	f26,   96(SP)
	lfd	f27,  104(SP)
	lfd	f28,  112(SP)
	lfd	f29,  120(SP)

	lfd	f30,  128(SP)
	lfd	f31,  136(SP)

#ifdef __64BIT__
	/* 64-bit build: 8-byte integer save slots. */
	ld	r31,  144(SP)
	ld	r30,  152(SP)
	ld	r29,  160(SP)
	ld	r28,  168(SP)
	ld	r27,  176(SP)
	ld	r26,  184(SP)
	ld	r25,  192(SP)
	ld	r24,  200(SP)
	ld	r23,  208(SP)
	ld	r22,  216(SP)
	ld	r21,  224(SP)
	ld	r20,  232(SP)
#if defined(TRMMKERNEL) || defined(TRSMKERNEL)
	ld	r19,  240(SP)
	ld	r18,  248(SP)
#endif
#else
	/* 32-bit build: 4-byte integer save slots. */
	lwz	r31,  144(SP)
	lwz	r30,  148(SP)
	lwz	r29,  152(SP)
	lwz	r28,  156(SP)
	lwz	r27,  160(SP)
	lwz	r26,  164(SP)
	lwz	r25,  168(SP)
	lwz	r24,  172(SP)
	lwz	r23,  176(SP)
	lwz	r22,  180(SP)
	lwz	r21,  184(SP)
	lwz	r20,  188(SP)
#if defined(TRMMKERNEL) || defined(TRSMKERNEL)
	lwz	r19,  192(SP)
	lwz	r18,  196(SP)
#endif
#endif

	addi	SP, SP, STACKSIZE

	blr

	EPILOGUE
	/* NOTE(review): the #endif below closes a conditional that was opened
	   before the first line of this chunk. */
#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -