
📄 zgemm_kernel_altivec.s

📁 Optimized GotoBLAS libraries
💻 Assembly (PowerPC / AltiVec)
	mr	CO1, C
	mr	AO, A
	srawi.	I, M,  3
	ble	LL(70)
	.align 4

LL(61):
	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	LOAD_A	a1, OFFSET_0, AO
	vxor	c04, c04, c04
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c07, c07, c07
	vxor	c08, c08, c08
	mr	BO, B

	dcbtst	CO1, PREC
	dcbtst	CO2, PREC

	vspltw	bp1, b1, 0
	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(65)
	.align 4

LL(62):
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08

	vmaddfp	c01, a5, bp1, c01
	vspltw	bp2, b1, 3
	vmaddfp	c02, a6, bp1, c02
	vmaddfp	c03, a7, bp1, c03
	vmaddfp	c04, a8, bp1, c04

	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0

	vmaddfp	c05, a5, bp2, c05
	vmaddfp	c06, a6, bp2, c06
	vmaddfp	c07, a7, bp2, c07
	vmaddfp	c08, a8, bp2, c08

	addi	AO, AO, 32 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	bdnz	LL(62)
	.align 4

LL(65):
	andi.	r0,  K,  1
	ble+	LL(68)
	.align 4

LL(66):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	addi	AO, AO, 16 * SIZE
	vmaddfp	c03, a3, bp1, c03
	addi	BO, BO,  2 * SIZE
	vmaddfp	c04, a4, bp1, c04
	nop

	vmaddfp	c05, a1, bp2, c05
	vmaddfp	c06, a2, bp2, c06
	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08
	.align 4

LL(68):
	vxor	VZERO, VZERO, VZERO

	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	vperm	c05, c05, c05, swap
	vperm	c06, c06, c06, swap
	vperm	c07, c07, c07, swap
	vperm	c08, c08, c08, swap

	vxor	c05, c05, neg
	vxor	c06, c06, neg
	vxor	c07, c07, neg
	vxor	c08, c08, neg

	vaddfp	c01, c01, c05
	vaddfp	c02, c02, c06
	vaddfp	c03, c03, c07
	vaddfp	c04, c04, c08

	vperm	c05, c01, c01, swap
	vperm	c06, c02, c02, swap
	vperm	c07, c03, c03, swap
	vperm	c08, c04, c04, swap

	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c02, alpha_r, c02, VZERO
	vmaddfp	c03, alpha_r, c03, VZERO
	vmaddfp	c04, alpha_r, c04, VZERO

	vmaddfp	c01, alpha_i, c05, c01
	vmaddfp	c02, alpha_i, c06, c02
	vmaddfp	c03, alpha_i, c07, c03
	vmaddfp	c04, alpha_i, c08, c04

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1
	lvx	C4, OFFSET_3, CO1
	lvx	C5, OFFSET_4, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2
	vaddfp	c02, c02, C3
	vaddfp	c03, c03, C4
	vaddfp	c04, c04, C5

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1

	addi	CO1, CO1, 16 * SIZE
	addic.	I, I, -1
	bgt+	LL(61)
	.align 4

LL(70):
	andi.	I, M,  4
	ble	LL(80)

	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	LOAD_A	a1, OFFSET_0, AO
	vxor	c04, c04, c04
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c07, c07, c07
	vxor	c08, c08, c08
	mr	BO, B

	vspltw	bp1, b1, 0
	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(75)
	.align 4

LL(72):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06

	vmaddfp	c03, a3, bp1, c03
	vspltw	bp2, b1, 3
	vmaddfp	c04, a4, bp1, c04
	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0
	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08

	addi	AO, AO, 16 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	bdnz	LL(72)
	.align 4

LL(75):
	andi.	r0,  K,  1
	ble+	LL(78)
	.align 4

LL(76):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	addi	AO, AO,  8 * SIZE
	vmaddfp	c05, a1, bp2, c05
	addi	BO, BO,  2 * SIZE
	vmaddfp	c06, a2, bp2, c06
	.align 4

LL(78):
	vaddfp	c01, c01, c03
	vaddfp	c02, c02, c04
	vaddfp	c05, c05, c07
	vaddfp	c06, c06, c08

	vxor	VZERO, VZERO, VZERO

	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	vperm	c05, c05, c05, swap
	vperm	c06, c06, c06, swap

	vxor	c05, c05, neg
	vxor	c06, c06, neg

	vaddfp	c01, c01, c05
	vaddfp	c02, c02, c06

	vperm	c05, c01, c01, swap
	vperm	c06, c02, c02, swap

	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c02, alpha_r, c02, VZERO

	vmaddfp	c01, alpha_i, c05, c01
	vmaddfp	c02, alpha_i, c06, c02

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02, VZERO,   PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2
	vaddfp	c02, c02, C3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1

	addi	CO1, CO1,  8 * SIZE
	.align 4

LL(80):
	andi.	I, M,  2
	ble	LL(90)

	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	vxor	c06, c06, c06
	mr	BO, B

	vspltw	bp1, b1, 0
	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(85)
	.align 4

LL(82):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c02, a2, bp1, c02
	vspltw	bp2, b1, 3
	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0
	vmaddfp	c06, a2, bp2, c06

	addi	AO, AO,  8 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	bdnz	LL(82)
	.align 4

LL(85):
	andi.	r0,  K,  1
	ble+	LL(88)
	.align 4

LL(86):
	vspltw	bp2, b1, 1
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c05, a1, bp2, c05
	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  2 * SIZE
	.align 4

LL(88):
	vaddfp	c01, c01, c02
	vaddfp	c05, c05, c06
	vaddfp	c09, c09, c10
	vaddfp	c13, c13, c14

	vxor	VZERO, VZERO, VZERO

	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	vperm	c05, c05, c05, swap
	vxor	c05, c05, neg
	vaddfp	c01, c01, c05
	vperm	c05, c01, c01, swap
	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c01, alpha_i, c05, c01

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01, VZERO,   PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1

	addi	CO1, CO1,  4 * SIZE
	.align 4

LL(90):
	andi.	I, M,  1
	ble	LL(999)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)
	LFD	f9,   1 * SIZE(AO)
	LFD	f10,  0 * SIZE(BO)
	LFD	f11,  1 * SIZE(BO)
	LFD	f12,  2 * SIZE(BO)
	LFD	f13,  3 * SIZE(BO)

	lfs	f0,  FZERO(SP)
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(95)
	.align 4

LL(92):
	fmadd	f0,  f8, f10, f0
	fmadd	f2,  f8, f11, f2
	fmadd	f1,  f9, f10, f1
	fmadd	f3,  f9, f11, f3

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)
	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)

	fmadd	f0,  f8, f12, f0
	fmadd	f2,  f8, f13, f2
	fmadd	f1,  f9, f12, f1
	fmadd	f3,  f9, f13, f3

	LFD	f8,   4 * SIZE(AO)
	LFD	f9,   5 * SIZE(AO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  4 * SIZE
	bdnz	LL(92)
	.align 4

LL(95):
	andi.	r0,  K,  1
	ble	LL(98)
	.align 4

LL(96):
	fmadd	f0,  f8, f10, f0
	fmadd	f2,  f8, f11, f2
	fmadd	f1,  f9, f10, f1
	fmadd	f3,  f9, f11, f3
	.align 4

LL(98):
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	fsub	f0, f0, f3
	fadd	f1, f1, f2
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	fadd	f0, f0, f3
	fsub	f1, f1, f2
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	fadd	f0, f0, f3
	fsub	f1, f2, f1
#else /* RR, RC, CR, CC */
	fsub	f0, f0, f3
	fadd	f1, f1, f2
#endif

	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  1 * SIZE(CO1)

	lfs	f12,  ALPHA_R + 0(SP)
	lfs	f13,  ALPHA_I + 4(SP)

#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
	fmadd	f8,  f12, f0, f8
	fnmsub	f9,  f12, f1, f9
	fmadd	f8,  f13, f1, f8
	fmadd	f9,  f13, f0, f9
#else
	fmadd	f8,  f12, f0, f8
	fmadd	f9,  f12, f1, f9
	fnmsub	f8,  f13, f1, f8
	fmadd	f9,  f13, f0, f9
#endif

	STFD	f8,  0 * SIZE(CO1)
	STFD	f9,  1 * SIZE(CO1)
	.align 4

LL(999):
	mr	SP, STACK

	li	r0,  0 * 16
	lvx	v20, SP, r0
	li	r0,  1 * 16
	lvx	v21, SP, r0
	li	r0,  2 * 16
	lvx	v22, SP, r0
	li	r0,  3 * 16
	lvx	v23, SP, r0
	li	r0,  4 * 16
	lvx	v24, SP, r0
	li	r0,  5 * 16
	lvx	v25, SP, r0
	li	r0,  6 * 16
	lvx	v26, SP, r0
	li	r0,  7 * 16
	lvx	v27, SP, r0
	li	r0,  8 * 16
	lvx	v28, SP, r0
	li	r0,  9 * 16
	lvx	v29, SP, r0
	li	r0, 10 * 16
	lvx	v30, SP, r0
	li	r0, 11 * 16
	lvx	v31, SP, r0

	mtspr	VRsave, VREG

#ifdef __64BIT__
	ld	r31,  192(SP)
	ld	r30,  200(SP)
	ld	r29,  208(SP)
	ld	r28,  216(SP)
	ld	r27,  224(SP)
	ld	r26,  232(SP)
	ld	r25,  240(SP)
	ld	r24,  248(SP)
	ld	r23,  256(SP)
	ld	r22,  264(SP)
	ld	r21,  272(SP)
	ld	r20,  280(SP)
	ld	r19,  288(SP)
	ld	r18,  296(SP)
	ld	r17,  304(SP)
	ld	r16,  312(SP)
	ld	r15,  320(SP)
	ld	r14,  328(SP)
#else
	lwz	r31,  192(SP)
	lwz	r30,  196(SP)
	lwz	r29,  200(SP)
	lwz	r28,  204(SP)
	lwz	r27,  208(SP)
	lwz	r26,  212(SP)
	lwz	r25,  216(SP)
	lwz	r24,  220(SP)
	lwz	r23,  224(SP)
	lwz	r22,  228(SP)
	lwz	r21,  232(SP)
	lwz	r20,  236(SP)
	lwz	r19,  240(SP)
	lwz	r18,  244(SP)
	lwz	r17,  248(SP)
	lwz	r16,  252(SP)
	lwz	r15,  256(SP)
	lwz	r14,  260(SP)
#endif

	addi	SP, SP, STACKSIZE
	blr
	EPILOGUE
#endif
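Note: the listing above is only the tail of the kernel. LL(61) is the main loop over M (unrolled by 8, with the K loop unrolled by two at LL(62)), LL(70)/LL(80)/LL(90) handle the M & 4, M & 2 and M & 1 remainders, and LL(999) restores v20-v31, VRsave and r14-r31 before returning. In the scalar path, LL(98) folds the four partial sums into the real and imaginary parts of the A*B product (with the sign choices selected by the conjugation macros NN/NR/RN/...), and the fmadd/fnmsub group that follows applies alpha and accumulates into C. Below is a minimal scalar C sketch of that final update for the non-conjugated (NN) case; the struct and function names are illustrative only and do not appear in GotoBLAS.

/* Scalar model of the LL(98) update (NN case).
 * Kernel register mapping: f0 ~ sum(a.re*b.re), f3 ~ sum(a.im*b.im),
 * f1 ~ sum(a.im*b.re),     f2 ~ sum(a.re*b.im),
 * f12 = alpha_r, f13 = alpha_i, f8/f9 = C[i].re / C[i].im.
 */
#include <stdio.h>

typedef struct { double re, im; } cplx;   /* precision here is illustrative */

static void update_c(cplx *c, cplx ab, cplx alpha) {
    /* fmadd f8,f12,f0,f8 ; fmadd f9,f12,f1,f9 ; fnmsub f8,f13,f1,f8 ; fmadd f9,f13,f0,f9 */
    c->re += alpha.re * ab.re - alpha.im * ab.im;
    c->im += alpha.re * ab.im + alpha.im * ab.re;
}

int main(void) {
    cplx ab    = {3.0, -1.0};             /* ab.re = f0 - f3, ab.im = f1 + f2 */
    cplx alpha = {0.5,  0.25};
    cplx c     = {1.0,  2.0};
    update_c(&c, ab, alpha);
    printf("%g%+gi\n", c.re, c.im);       /* prints 2.75+2.25i */
    return 0;
}

The vector sections perform the same arithmetic across the four-float AltiVec lanes: the swap permute and the neg sign mask loaded at LL(68)/LL(78)/LL(88) combine the two sets of partial products (accumulated from the even- and odd-word splats of b1), and the alpha_r/alpha_i vmaddfp pair is the vector form of the scaling sketched above, followed by the lvsr/vperm sequence that handles unaligned stores into C.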
