📄 zgemm_kernel_altivec.s

📁 Optimized GotoBLAS libraries
📖 Page 1 of 3

	vxor	c14, c14, neg
	vxor	c15, c15, neg
	vxor	c16, c16, neg

	vaddfp	c01, c01, c05
	vaddfp	c02, c02, c06
	vaddfp	c03, c03, c07
	vaddfp	c04, c04, c08

	vaddfp	c09, c09, c13
	vaddfp	c10, c10, c14
	vaddfp	c11, c11, c15
	vaddfp	c12, c12, c16

	vperm	c05, c01, c01, swap
	vperm	c06, c02, c02, swap
	vperm	c07, c03, c03, swap
	vperm	c08, c04, c04, swap

	vperm	c13, c09, c09, swap
	vperm	c14, c10, c10, swap
	vperm	c15, c11, c11, swap
	vperm	c16, c12, c12, swap

	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c02, alpha_r, c02, VZERO
	vmaddfp	c03, alpha_r, c03, VZERO
	vmaddfp	c04, alpha_r, c04, VZERO

	vmaddfp	c01, alpha_i, c05, c01
	vmaddfp	c02, alpha_i, c06, c02
	vmaddfp	c03, alpha_i, c07, c03
	vmaddfp	c04, alpha_i, c08, c04

	vmaddfp	c09, alpha_r, c09, VZERO
	vmaddfp	c10, alpha_r, c10, VZERO
	vmaddfp	c11, alpha_r, c11, VZERO
	vmaddfp	c12, alpha_r, c12, VZERO

	vmaddfp	c09, alpha_i, c13, c09
	vmaddfp	c10, alpha_i, c14, c10
	vmaddfp	c11, alpha_i, c15, c11
	vmaddfp	c12, alpha_i, c16, c12

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1
	lvx	C4, OFFSET_3, CO1
	lvx	C5, OFFSET_4, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2
	vaddfp	c02, c02, C3
	vaddfp	c03, c03, C4
	vaddfp	c04, c04, C5

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2
	lvx	C4, OFFSET_3, CO2
	lvx	C5, OFFSET_4, CO2

	vperm	c00, VZERO, c09,   PERMRSHIFT2
	vperm	c09, c09,   c10,   PERMRSHIFT2
	vperm	c10, c10,   c11,   PERMRSHIFT2
	vperm	c11, c11,   c12,   PERMRSHIFT2
	vperm	c12, c12,   VZERO, PERMRSHIFT2

	vaddfp	c00, c00, C1
	vaddfp	c09, c09, C2
	vaddfp	c10, c10, C3
	vaddfp	c11, c11, C4
	vaddfp	c12, c12, C5

	stvx	c00, OFFSET_0, CO2
	stvx	c09, OFFSET_1, CO2
	stvx	c10, OFFSET_2, CO2
	stvx	c11, OFFSET_3, CO2
	stvx	c12, OFFSET_4, CO2

	addi	CO1, CO1, 16 * SIZE
	addi	CO2, CO2, 16 * SIZE
	addic.	I, I, -1
	bgt+	LL(11)
	.align 4

LL(20):
	andi.	I, M,  4
	ble	LL(30)

	vxor	c01, c01, c01
	LOAD_A	a1, OFFSET_0, AO
	vxor	c02, c02, c02
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c09, c09, c09
	LOAD_B	b1, OFFSET_0, B
	vxor	c10, c10, c10
	LOAD_B	b2, OFFSET_1, B
	vxor	c13, c13, c13
	vxor	c14, c14, c14
	mr	BO, B

	vspltw	bp1, b1, 0
	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(25)
	.align 4

LL(22):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	addi	AO, AO, 16 * SIZE
	vmaddfp	c02, a2, bp1, c02
	addi	BO, BO,  8 * SIZE
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	LOAD_B	b1, OFFSET_0, BO
	vmaddfp	c10, a2, bp1, c10
	vmaddfp	c13, a1, bp2, c13
	LOAD_A	a1, OFFSET_0, AO
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a2, OFFSET_1, AO

	vmaddfp	c01, a3, bp1, c01
	vspltw	bp2, b2, 1
	vmaddfp	c02, a4, bp1, c02
	vmaddfp	c05, a3, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a4, bp2, c06

	vmaddfp	c09, a3, bp1, c09
	vspltw	bp2, b2, 3
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c10, a4, bp1, c10
	vmaddfp	c13, a3, bp2, c13
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c14, a4, bp2, c14
	LOAD_A	a4, OFFSET_3, AO
	vspltw	bp1, b1, 0
	bdnz	LL(22)
	.align 4

LL(25):
	andi.	r0,  K,  1
	ble+	LL(28)
	.align 4

LL(26):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	nop
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	nop
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	addi	AO, AO,  8 * SIZE
	vmaddfp	c13, a1, bp2, c13
	addi	BO, BO,  4 * SIZE
	vmaddfp	c14, a2, bp2, c14
	nop
	.align 4

LL(28):
	vxor	VZERO, VZERO, VZERO

	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	vperm	c05, c05, c05, swap
	vperm	c06, c06, c06, swap
	vperm	c13, c13, c13, swap
	vperm	c14, c14, c14, swap

	vxor	c05, c05, neg
	vxor	c06, c06, neg
	vxor	c13, c13, neg
	vxor	c14, c14, neg

	vaddfp	c01, c01, c05
	vaddfp	c02, c02, c06
	vaddfp	c09, c09, c13
	vaddfp	c10, c10, c14

	vperm	c05, c01, c01, swap
	vperm	c06, c02, c02, swap
	vperm	c13, c09, c09, swap
	vperm	c14, c10, c10, swap

	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c02, alpha_r, c02, VZERO
	vmaddfp	c01, alpha_i, c05, c01
	vmaddfp	c02, alpha_i, c06, c02

	vmaddfp	c09, alpha_r, c09, VZERO
	vmaddfp	c10, alpha_r, c10, VZERO
	vmaddfp	c09, alpha_i, c13, c09
	vmaddfp	c10, alpha_i, c14, c10

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02, VZERO,   PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2
	vaddfp	c02, c02, C3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2

	vperm	c00, VZERO, c09,   PERMRSHIFT2
	vperm	c09, c09,   c10,   PERMRSHIFT2
	vperm	c10, c10,   VZERO, PERMRSHIFT2

	vaddfp	c00, c00, C1
	vaddfp	c09, c09, C2
	vaddfp	c10, c10, C3

	stvx	c00, OFFSET_0, CO2
	stvx	c09, OFFSET_1, CO2
	stvx	c10, OFFSET_2, CO2

	addi	CO1, CO1, 8 * SIZE
	addi	CO2, CO2, 8 * SIZE
	.align 4

LL(30):
	andi.	I, M,  2
	ble	LL(40)

	vxor	c01, c01, c01
	LOAD_A	a1, OFFSET_0, AO
	vxor	c02, c02, c02
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_B	b1, OFFSET_0, B
	vxor	c06, c06, c06
	LOAD_B	b2, OFFSET_1, B
	vxor	c09, c09, c09
	vxor	c10, c10, c10
	vxor	c13, c13, c13
	vxor	c14, c14, c14

	vspltw	bp1, b1, 0
	mr	BO, B

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(35)
	.align 4

LL(32):
	vmaddfp	c01, a1, bp1, c01
	addi	AO, AO,  8 * SIZE
	vspltw	bp2, b1, 1
	vmaddfp	c05, a1, bp2, c05
	addi	BO, BO,  8 * SIZE
	vspltw	bp1, b1, 2
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c13, a1, bp2, c13
	LOAD_A	a1, OFFSET_0, AO

	vspltw	bp1, b2, 0
	LOAD_B	b1, OFFSET_0, BO
	vmaddfp	c02, a2, bp1, c02
	vspltw	bp2, b2, 1
	vmaddfp	c06, a2, bp2, c06
	vspltw	bp1, b2, 2
	vmaddfp	c10, a2, bp1, c10
	vspltw	bp2, b2, 3
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a2, OFFSET_1, AO
	vspltw	bp1, b1, 0
	bdnz	LL(32)
	.align 4

LL(35):
	andi.	r0,  K,  1
	ble+	LL(38)
	.align 4

LL(36):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c13, a1, bp2, c13
	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  4 * SIZE
	.align 4

LL(38):
	vaddfp	c01, c01, c02
	vaddfp	c05, c05, c06
	vaddfp	c09, c09, c10
	vaddfp	c13, c13, c14

	vxor	VZERO, VZERO, VZERO

	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	vperm	c05, c05, c05, swap
	vperm	c13, c13, c13, swap

	vxor	c05, c05, neg
	vxor	c13, c13, neg

	vaddfp	c01, c01, c05
	vaddfp	c09, c09, c13

	vperm	c05, c01, c01, swap
	vperm	c13, c09, c09, swap

	vmaddfp	c01, alpha_r, c01, VZERO
	vmaddfp	c01, alpha_i, c05, c01
	vmaddfp	c09, alpha_r, c09, VZERO
	vmaddfp	c09, alpha_i, c13, c09

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01, VZERO,   PERMRSHIFT1

	vaddfp	c00, c00, C1
	vaddfp	c01, c01, C2

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2

	vperm	c00, VZERO, c09,   PERMRSHIFT2
	vperm	c09, c09,   VZERO, PERMRSHIFT2

	vaddfp	c00, c00, C1
	vaddfp	c09, c09, C2

	stvx	c00, OFFSET_0, CO2
	stvx	c09, OFFSET_1, CO2

	addi	CO1, CO1,  4 * SIZE
	addi	CO2, CO2,  4 * SIZE
	.align 4

LL(40):
	andi.	I, M,  1
	ble	LL(49)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)
	LFD	f9,   1 * SIZE(AO)

	LFD	f10,  0 * SIZE(BO)
	LFD	f11,  1 * SIZE(BO)
	LFD	f12,  2 * SIZE(BO)
	LFD	f13,  3 * SIZE(BO)

	lfs	f0,  FZERO(SP)
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0
	fmr	f4,  f0
	fmr	f5,  f0
	fmr	f6,  f0
	fmr	f7,  f0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(45)
	.align 4

LL(42):
	fmadd	f0,  f8, f10, f0
	fmadd	f2,  f8, f11, f2
	fmadd	f4,  f8, f12, f4
	fmadd	f6,  f8, f13, f6

	fmadd	f1,  f9, f10, f1
	fmadd	f3,  f9, f11, f3
	fmadd	f5,  f9, f12, f5
	fmadd	f7,  f9, f13, f7

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)

	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	fmadd	f0,  f8, f10, f0
	fmadd	f2,  f8, f11, f2
	fmadd	f4,  f8, f12, f4
	fmadd	f6,  f8, f13, f6

	fmadd	f1,  f9, f10, f1
	fmadd	f3,  f9, f11, f3
	fmadd	f5,  f9, f12, f5
	fmadd	f7,  f9, f13, f7

	LFD	f8,   4 * SIZE(AO)
	LFD	f9,   5 * SIZE(AO)

	LFD	f10,  8 * SIZE(BO)
	LFD	f11,  9 * SIZE(BO)
	LFD	f12, 10 * SIZE(BO)
	LFD	f13, 11 * SIZE(BO)

	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  8 * SIZE
	bdnz	LL(42)
	.align 4

LL(45):
	andi.	r0,  K,  1
	ble	LL(48)
	.align 4

LL(46):
	fmadd	f0,  f8, f10, f0
	fmadd	f2,  f8, f11, f2
	fmadd	f4,  f8, f12, f4
	fmadd	f6,  f8, f13, f6

	fmadd	f1,  f9, f10, f1
	fmadd	f3,  f9, f11, f3
	fmadd	f5,  f9, f12, f5
	fmadd	f7,  f9, f13, f7

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  4 * SIZE
	.align 4

LL(48):
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	fsub	f0, f0, f3
	fadd	f1, f1, f2
	fsub	f4, f4, f7
	fadd	f5, f5, f6
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	fadd	f0, f0, f3
	fsub	f1, f1, f2
	fadd	f4, f4, f7
	fsub	f5, f5, f6
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	fadd	f0, f0, f3
	fsub	f1, f2, f1
	fadd	f4, f4, f7
	fsub	f5, f6, f5
#else /* RR, RC, CR, CC */
	fsub	f0, f0, f3
	fadd	f1, f1, f2
	fsub	f4, f4, f7
	fadd	f5, f5, f6
#endif

	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  1 * SIZE(CO1)
	LFD	f10, 0 * SIZE(CO2)
	LFD	f11, 1 * SIZE(CO2)

	lfs	f12,  ALPHA_R + 0(SP)
	lfs	f13,  ALPHA_I + 4(SP)

#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
	fmadd	f8,  f12, f0, f8
	fnmsub	f9,  f12, f1, f9
	fmadd	f10, f12, f4, f10
	fnmsub	f11, f12, f5, f11

	fmadd	f8,  f13, f1, f8
	fmadd	f9,  f13, f0, f9
	fmadd	f10, f13, f5, f10
	fmadd	f11, f13, f4, f11
#else
	fmadd	f8,  f12, f0, f8
	fmadd	f9,  f12, f1, f9
	fmadd	f10, f12, f4, f10
	fmadd	f11, f12, f5, f11

	fnmsub	f8,  f13, f1, f8
	fmadd	f9,  f13, f0, f9
	fnmsub	f10, f13, f5, f10
	fmadd	f11, f13, f4, f11
#endif

	STFD	f8,  0 * SIZE(CO1)
	STFD	f9,  1 * SIZE(CO1)
	STFD	f10, 0 * SIZE(CO2)
	STFD	f11, 1 * SIZE(CO2)

LL(49):
	mr	B, BO
	addic.	J, J, -1
	bgt	LL(01)
	.align 4

LL(50):
	andi.	J, N,  1
	ble	LL(999)
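
The listing above is only the first part of the kernel, but the scalar tail (labels LL(40) through LL(48)) is visible in full and spells out the arithmetic the whole file implements: C := alpha*A*B + C on packed complex panels. Below is a minimal C sketch of that one-row-by-two-column tail, for the non-conjugated (NN-style) case only. The function name, the packed-pointer layout, and the choice of float (suggested by the single-precision vmaddfp in the vector path) are assumptions made for illustration, not definitions taken from the kernel's macros.

#include <stddef.h>

typedef float FLOAT;   /* assumption: single precision, matching vmaddfp in the vector path */

/* Scalar reference for the M-remainder-of-1, two-column tail (NN case).
 * AO holds one packed row of complex values (re, im per k); BO holds two
 * packed columns (b0_re, b0_im, b1_re, b1_im per k).  CO1/CO2 point at the
 * two C entries to update: C[j] += alpha * sum_k a[k] * b[k][j].          */
void ztail_1x2_nn(size_t K,
                  FLOAT alpha_r, FLOAT alpha_i,
                  const FLOAT *AO, const FLOAT *BO,
                  FLOAT *CO1, FLOAT *CO2)
{
    /* f0..f7 in the assembly: the four re*re / re*im / im*re / im*im sums per column */
    FLOAT f0 = 0, f1 = 0, f2 = 0, f3 = 0, f4 = 0, f5 = 0, f6 = 0, f7 = 0;

    for (size_t k = 0; k < K; k++) {
        FLOAT a_re  = AO[2 * k + 0], a_im  = AO[2 * k + 1];
        FLOAT b0_re = BO[4 * k + 0], b0_im = BO[4 * k + 1];
        FLOAT b1_re = BO[4 * k + 2], b1_im = BO[4 * k + 3];

        f0 += a_re * b0_re;  f2 += a_re * b0_im;
        f4 += a_re * b1_re;  f6 += a_re * b1_im;
        f1 += a_im * b0_re;  f3 += a_im * b0_im;
        f5 += a_im * b1_re;  f7 += a_im * b1_im;
    }

    /* LL(48), NN branch: fold the cross terms into the complex products */
    FLOAT p0_re = f0 - f3, p0_im = f1 + f2;   /* A * B(:,0) */
    FLOAT p1_re = f4 - f7, p1_im = f5 + f6;   /* A * B(:,1) */

    /* C += alpha * product, with alpha = alpha_r + i*alpha_i (the #else branch) */
    CO1[0] += alpha_r * p0_re - alpha_i * p0_im;
    CO1[1] += alpha_r * p0_im + alpha_i * p0_re;
    CO2[0] += alpha_r * p1_re - alpha_i * p1_im;
    CO2[1] += alpha_r * p1_im + alpha_i * p1_re;
}

The wider paths at LL(20) and LL(30) appear to perform the same combination four or two complex elements at a time: the swap and neg constants loaded from the stack swap the real/imaginary lanes and flip signs so the cross terms can be folded with plain vaddfp, and the lvsr/vperm sequence shifts the results into place so the read-modify-write stvx stores still work when CO1/CO2 are not 16-byte aligned.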
