⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemm_kernel_altivec.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 4 页
字号:
# ---------------------------------------------------------------------------
# Fragment of a GotoBLAS single-precision GEMM kernel for PowerPC/AltiVec
# (page 1 of 4 of the original file; the routine's prologue and the head of
# the M=16 inner loop LL(11) lie before this fragment, and code after LL(59)
# follows it — NOTE(review): register roles below inferred from this fragment
# only; confirm against the full file).
#
# Visible conventions (all established by code in this fragment):
#   c01..c16        accumulator vectors (4 floats each)
#   a1..a4 / b1,b2  packed A / B panel vectors, fetched via LOAD_A / LOAD_B
#   bp1, bp2        one B element splatted across a vector (vspltw)
#   C1..C9          existing C values loaded for the alpha*AB + C update
#   CO1..CO4        pointers to four consecutive columns of C
#   PERMRSHIFTn     lvsr-derived permute masks: each column's accumulators
#                   are shifted through vperm (with VZERO at both edges) so
#                   stvx stores line up with a possibly unaligned C pointer
#   K loops are unrolled by 2 (srawi. r0, K, 1 -> CTR) with an odd-K tail.
# ---------------------------------------------------------------------------

	# --- tail of the M=16 epilogue: column 3 stores overlap column 4 permutes ---
	stvx	c10, OFFSET_2, CO3
	vperm	c14, c14,   c15,   PERMRSHIFT4
	stvx	c11, OFFSET_3, CO3
	vperm	c15, c15,   c16,   PERMRSHIFT4
	stvx	c12, OFFSET_4, CO3
	vperm	c16, c16,   VZERO, PERMRSHIFT4
	# column 4: C += alpha * acc, then store 5 shifted vectors
	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c13, alpha, c13, C6
	vmaddfp	c14, alpha, c14, C7
	vmaddfp	c15, alpha, c15, C8
	vmaddfp	c16, alpha, c16, C9
	stvx	c00, OFFSET_0, CO4
	stvx	c13, OFFSET_1, CO4
	stvx	c14, OFFSET_2, CO4
	stvx	c15, OFFSET_3, CO4
	stvx	c16, OFFSET_4, CO4
	# advance all four C column pointers by 16 elements
	addi	CO1, CO1, 16 * SIZE
	addi	CO2, CO2, 16 * SIZE
	addi	CO3, CO3, 16 * SIZE
	addi	CO4, CO4, 16 * SIZE
	addic.	I, I, -1		# next M=16 panel (loop head not in this fragment)
	bgt+	LL(11)
	b	LL(20)
	.align 4

# --- LL(19): alternate epilogue for the last M=16 iteration -----------------
# Same alpha*acc + C update for all four columns; C loads for the next column
# are interleaved with the current column's multiply-adds to hide latency.
LL(19):
	lvx	C6, OFFSET_1, CO2
	lvx	C7, OFFSET_2, CO2
	lvx	C8, OFFSET_3, CO2
	lvx	C9, OFFSET_4, CO2
	# column 1: shift accumulators to C alignment (VZERO pads the edges)
	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1
	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	lvx	C2, OFFSET_1, CO3	# prefetch column-3 C while updating column 1
	vmaddfp	c02, alpha, c02, C3
	lvx	C3, OFFSET_2, CO3
	vmaddfp	c03, alpha, c03, C4
	lvx	C4, OFFSET_3, CO3
	vmaddfp	c04, alpha, c04, C5
	lvx	C5, OFFSET_4, CO3
	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1
	lvx	C1, OFFSET_0, CO2
	# column 2
	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   c06,   PERMRSHIFT2
	vperm	c06, c06,   c07,   PERMRSHIFT2
	vperm	c07, c07,   c08,   PERMRSHIFT2
	vperm	c08, c08,   VZERO, PERMRSHIFT2
	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C6
	lvx	C6, OFFSET_1, CO4	# prefetch column-4 C
	vmaddfp	c06, alpha, c06, C7
	lvx	C7, OFFSET_2, CO4
	vmaddfp	c07, alpha, c07, C8
	lvx	C8, OFFSET_3, CO4
	vmaddfp	c08, alpha, c08, C9
	lvx	C9, OFFSET_4, CO4
	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2
	stvx	c06, OFFSET_2, CO2
	stvx	c07, OFFSET_3, CO2
	stvx	c08, OFFSET_4, CO2
	lvx	C1, OFFSET_0, CO3
	# column 3
	vperm	c00, VZERO, c09,   PERMRSHIFT3
	vperm	c09, c09,   c10,   PERMRSHIFT3
	vperm	c10, c10,   c11,   PERMRSHIFT3
	vperm	c11, c11,   c12,   PERMRSHIFT3
	vperm	c12, c12,   VZERO, PERMRSHIFT3
	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c09, alpha, c09, C2
	vmaddfp	c10, alpha, c10, C3
	vmaddfp	c11, alpha, c11, C4
	vmaddfp	c12, alpha, c12, C5
	stvx	c00, OFFSET_0, CO3
	stvx	c09, OFFSET_1, CO3
	stvx	c10, OFFSET_2, CO3
	stvx	c11, OFFSET_3, CO3
	stvx	c12, OFFSET_4, CO3
	lvx	C1, OFFSET_0, CO4
	# column 4
	vperm	c00, VZERO, c13,   PERMRSHIFT4
	vperm	c13, c13,   c14,   PERMRSHIFT4
	vperm	c14, c14,   c15,   PERMRSHIFT4
	vperm	c15, c15,   c16,   PERMRSHIFT4
	vperm	c16, c16,   VZERO, PERMRSHIFT4
	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c13, alpha, c13, C6
	vmaddfp	c14, alpha, c14, C7
	vmaddfp	c15, alpha, c15, C8
	vmaddfp	c16, alpha, c16, C9
	stvx	c00, OFFSET_0, CO4
	stvx	c13, OFFSET_1, CO4
	stvx	c14, OFFSET_2, CO4
	stvx	c15, OFFSET_3, CO4
	stvx	c16, OFFSET_4, CO4
	addi	CO1, CO1, 16 * SIZE
	addi	CO2, CO2, 16 * SIZE
	addi	CO3, CO3, 16 * SIZE
	addi	CO4, CO4, 16 * SIZE
	addic.	I, I, -1
	bgt+	LL(11)
	.align 4

# --- LL(20): M & 8 remainder — 8 rows x 4 columns, accumulators c01..c14 ----
LL(20):
	andi.	I, M,  8
	ble	LL(30)
	# zero accumulators, interleaved with first A/B panel loads
	vxor	c01, c01, c01
	LOAD_A	a1, OFFSET_0, AO
	vxor	c02, c02, c02
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c09, c09, c09
	LOAD_B	b1, OFFSET_0, B
	vxor	c10, c10, c10
	LOAD_B	b2, OFFSET_1, B
	vxor	c13, c13, c13
	vxor	c14, c14, c14
	mr	BO, B
	vspltw	bp1, b1, 0
	srawi.	r0,  K,  1		# K/2 iterations (loop unrolled by 2)
	mtspr	CTR, r0
	ble	LL(25)
	.align 4
# inner K loop: each iteration consumes 2 k-steps (b1 then b2)
LL(22):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	addi	AO, AO, 16 * SIZE
	vmaddfp	c02, a2, bp1, c02
	addi	BO, BO,  8 * SIZE
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	LOAD_B	b1, OFFSET_0, BO
	vmaddfp	c10, a2, bp1, c10
	vmaddfp	c13, a1, bp2, c13
	LOAD_A	a1, OFFSET_0, AO
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a2, OFFSET_1, AO
	vmaddfp	c01, a3, bp1, c01
	vspltw	bp2, b2, 1
	vmaddfp	c02, a4, bp1, c02
	vmaddfp	c05, a3, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a4, bp2, c06
	vmaddfp	c09, a3, bp1, c09
	vspltw	bp2, b2, 3
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c10, a4, bp1, c10
	vmaddfp	c13, a3, bp2, c13
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c14, a4, bp2, c14
	LOAD_A	a4, OFFSET_3, AO
	vspltw	bp1, b1, 0
	bdnz	LL(22)
	.align 4
LL(25):
	andi.	r0,  K,  1		# odd K -> one more k-step
	lvx	alpha, OFFSET_0, SP	# alpha vector kept at the stack pointer
	vxor	VZERO, VZERO, VZERO
	ble+	LL(28)
	.align 4
LL(26):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	nop
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	nop
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	addi	AO, AO,  8 * SIZE
	vmaddfp	c13, a1, bp2, c13
	addi	BO, BO,  4 * SIZE
	vmaddfp	c14, a2, bp2, c14
	nop
	.align 4
# epilogue for M&8: 3 shifted vectors per column (8 floats + alignment slop)
LL(28):
	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1
	lvsr	PERMRSHIFT1, 0, CO1	# build unaligned-store masks from C pointers
	lvsr	PERMRSHIFT2, 0, CO2
	lvsr	PERMRSHIFT3, 0, CO3
	lvsr	PERMRSHIFT4, 0, CO4
	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   VZERO, PERMRSHIFT1
	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3
	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2
	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   c06,   PERMRSHIFT2
	vperm	c06, c06,   VZERO, PERMRSHIFT2
	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C2
	vmaddfp	c06, alpha, c06, C3
	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2
	stvx	c06, OFFSET_2, CO2
	lvx	C1, OFFSET_0, CO3
	lvx	C2, OFFSET_1, CO3
	lvx	C3, OFFSET_2, CO3
	vperm	c00, VZERO, c09,   PERMRSHIFT3
	vperm	c09, c09,   c10,   PERMRSHIFT3
	vperm	c10, c10,   VZERO, PERMRSHIFT3
	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c09, alpha, c09, C2
	vmaddfp	c10, alpha, c10, C3
	stvx	c00, OFFSET_0, CO3
	stvx	c09, OFFSET_1, CO3
	stvx	c10, OFFSET_2, CO3
	lvx	C1, OFFSET_0, CO4
	lvx	C2, OFFSET_1, CO4
	lvx	C3, OFFSET_2, CO4
	vperm	c00, VZERO, c13,   PERMRSHIFT4
	vperm	c13, c13,   c14,   PERMRSHIFT4
	vperm	c14, c14,   VZERO, PERMRSHIFT4
	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c13, alpha, c13, C2
	vmaddfp	c14, alpha, c14, C3
	stvx	c00, OFFSET_0, CO4
	stvx	c13, OFFSET_1, CO4
	stvx	c14, OFFSET_2, CO4
	addi	CO1, CO1, 8 * SIZE
	addi	CO2, CO2, 8 * SIZE
	addi	CO3, CO3, 8 * SIZE
	addi	CO4, CO4, 8 * SIZE
	.align 4

# --- LL(30): M & 4 remainder — 4 rows x 4 columns ---------------------------
# Two accumulator sets (c01.. / c02..) are used across the 2-way unrolled K
# loop and summed at LL(38).
LL(30):
	andi.	I, M,  4
	ble	LL(40)
	vxor	c01, c01, c01
	LOAD_A	a1, OFFSET_0, AO
	vxor	c02, c02, c02
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_B	b1, OFFSET_0, B
	vxor	c06, c06, c06
	LOAD_B	b2, OFFSET_1, B
	vxor	c09, c09, c09
	vxor	c10, c10, c10
	vxor	c13, c13, c13
	vxor	c14, c14, c14
	vspltw	bp1, b1, 0
	mr	BO, B
	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(35)
	.align 4
LL(32):
	vmaddfp	c01, a1, bp1, c01
	addi	AO, AO,  8 * SIZE
	vspltw	bp2, b1, 1
	vmaddfp	c05, a1, bp2, c05
	addi	BO, BO,  8 * SIZE
	vspltw	bp1, b1, 2
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c13, a1, bp2, c13
	LOAD_A	a1, OFFSET_0, AO
	vspltw	bp1, b2, 0
	LOAD_B	b1, OFFSET_0, BO
	vmaddfp	c02, a2, bp1, c02
	vspltw	bp2, b2, 1
	vmaddfp	c06, a2, bp2, c06
	vspltw	bp1, b2, 2
	vmaddfp	c10, a2, bp1, c10
	vspltw	bp2, b2, 3
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a2, OFFSET_1, AO
	vspltw	bp1, b1, 0
	bdnz	LL(32)
	.align 4
LL(35):
	andi.	r0,  K,  1
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(38)
	.align 4
LL(36):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c13, a1, bp2, c13
	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  4 * SIZE
	.align 4
LL(38):
	# collapse the dual accumulator sets, then per-column update (2 vectors)
	vaddfp	c01, c01, c02
	vaddfp	c05, c05, c06
	vaddfp	c09, c09, c10
	vaddfp	c13, c13, c14
	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	lvsr	PERMRSHIFT3, 0, CO3
	lvsr	PERMRSHIFT4, 0, CO4
	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   VZERO, PERMRSHIFT1
	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   VZERO, PERMRSHIFT2
	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C2
	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2
	lvx	C1, OFFSET_0, CO3
	lvx	C2, OFFSET_1, CO3
	vperm	c00, VZERO, c09,   PERMRSHIFT3
	vperm	c09, c09,   VZERO, PERMRSHIFT3
	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c09, alpha, c09, C2
	stvx	c00, OFFSET_0, CO3
	stvx	c09, OFFSET_1, CO3
	lvx	C1, OFFSET_0, CO4
	lvx	C2, OFFSET_1, CO4
	vperm	c00, VZERO, c13,   PERMRSHIFT4
	vperm	c13, c13,   VZERO, PERMRSHIFT4
	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c13, alpha, c13, C2
	stvx	c00, OFFSET_0, CO4
	stvx	c13, OFFSET_1, CO4
	addi	CO1, CO1, 4 * SIZE
	addi	CO2, CO2, 4 * SIZE
	addi	CO3, CO3, 4 * SIZE
	addi	CO4, CO4, 4 * SIZE
	.align 4

# --- LL(40): M & 2 remainder — scalar FPU path, 2 rows x 4 columns ----------
# f0..f7 accumulate the eight C elements; f8/f9 = A, f10..f13 = B.
LL(40):
	andi.	I, M,  2
	ble	LL(50)
	mr	BO, B
	LFD	f8,   0 * SIZE(AO)
	LFD	f9,   1 * SIZE(AO)
	LFD	f10,  0 * SIZE(B)
	LFD	f11,  1 * SIZE(B)
	LFD	f12,  2 * SIZE(B)
	LFD	f13,  3 * SIZE(B)
	lfs	f0,  FZERO(SP)		# zero accumulators from a 0.0 on the stack
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0
	fmr	f4,  f0
	fmr	f5,  f0
	fmr	f6,  f0
	fmr	f7,  f0
	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(45)
	.align 4
LL(42):
	FMADD	f0,  f8, f10, f0
	FMADD	f2,  f8, f11, f2
	FMADD	f4,  f8, f12, f4
	FMADD	f6,  f8, f13, f6
	FMADD	f1,  f9, f10, f1
	FMADD	f3,  f9, f11, f3
	FMADD	f5,  f9, f12, f5
	FMADD	f7,  f9, f13, f7
	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)
	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)
	FMADD	f0,  f8, f10, f0
	FMADD	f2,  f8, f11, f2
	FMADD	f4,  f8, f12, f4
	FMADD	f6,  f8, f13, f6
	FMADD	f1,  f9, f10, f1
	FMADD	f3,  f9, f11, f3
	FMADD	f5,  f9, f12, f5
	FMADD	f7,  f9, f13, f7
	LFD	f8,   4 * SIZE(AO)
	LFD	f9,   5 * SIZE(AO)
	LFD	f10,  8 * SIZE(BO)
	LFD	f11,  9 * SIZE(BO)
	LFD	f12, 10 * SIZE(BO)
	LFD	f13, 11 * SIZE(BO)
	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  8 * SIZE
	bdnz	LL(42)
	.align 4
LL(45):
	andi.	r0,  K,  1
	ble	LL(48)
	.align 4
LL(46):
	FMADD	f0,  f8, f10, f0
	FMADD	f2,  f8, f11, f2
	FMADD	f4,  f8, f12, f4
	FMADD	f6,  f8, f13, f6
	FMADD	f1,  f9, f10, f1
	FMADD	f3,  f9, f11, f3
	FMADD	f5,  f9, f12, f5
	FMADD	f7,  f9, f13, f7
	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)
	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)
	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  4 * SIZE
	.align 4
LL(48):
	# C(0..1, 0..3) = alpha * acc + C, scalar loads/stores per element
	lfs	f13,  ALPHA(SP)
	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  1 * SIZE(CO1)
	LFD	f10, 0 * SIZE(CO2)
	LFD	f11, 1 * SIZE(CO2)
	FMADD	f0,  f0, f13, f8
	FMADD	f1,  f1, f13, f9
	FMADD	f2,  f2, f13, f10
	FMADD	f3,  f3, f13, f11
	LFD	f8,  0 * SIZE(CO3)
	LFD	f9,  1 * SIZE(CO3)
	LFD	f10, 0 * SIZE(CO4)
	LFD	f11, 1 * SIZE(CO4)
	FMADD	f4,  f4, f13, f8
	FMADD	f5,  f5, f13, f9
	FMADD	f6,  f6, f13, f10
	FMADD	f7,  f7, f13, f11
	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  1 * SIZE(CO1)
	STFD	f2,  0 * SIZE(CO2)
	STFD	f3,  1 * SIZE(CO2)
	STFD	f4,  0 * SIZE(CO3)
	STFD	f5,  1 * SIZE(CO3)
	STFD	f6,  0 * SIZE(CO4)
	STFD	f7,  1 * SIZE(CO4)
	addi	CO1, CO1, 2 * SIZE
	addi	CO2, CO2, 2 * SIZE
	addi	CO3, CO3, 2 * SIZE
	addi	CO4, CO4, 2 * SIZE
	.align 4

# --- LL(50): M & 1 remainder — scalar path, 1 row x 4 columns ---------------
# K loop unrolled by 2: f8 carries the even-k A element, f9 the odd-k one.
LL(50):
	andi.	I, M,  1
	ble	LL(59)
	mr	BO, B
	LFD	f8,   0 * SIZE(AO)
	LFD	f9,   1 * SIZE(AO)
	LFD	f10,  0 * SIZE(B)
	LFD	f11,  1 * SIZE(B)
	LFD	f12,  2 * SIZE(B)
	LFD	f13,  3 * SIZE(B)
	lfs	f0,  FZERO(SP)
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0
	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(55)
	.align 4
LL(52):
	FMADD	f0,  f8, f10, f0
	FMADD	f1,  f8, f11, f1
	FMADD	f2,  f8, f12, f2
	FMADD	f3,  f8, f13, f3
	LFD	f8,   2 * SIZE(AO)
	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)
	FMADD	f0,  f9, f10, f0
	FMADD	f1,  f9, f11, f1
	FMADD	f2,  f9, f12, f2
	FMADD	f3,  f9, f13, f3
	LFD	f9,   3 * SIZE(AO)
	LFD	f10,  8 * SIZE(BO)
	LFD	f11,  9 * SIZE(BO)
	LFD	f12, 10 * SIZE(BO)
	LFD	f13, 11 * SIZE(BO)
	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  8 * SIZE
	bdnz	LL(52)
	.align 4
LL(55):
	andi.	r0,  K,  1
	ble	LL(58)
	.align 4
LL(56):
	FMADD	f0,  f8, f10, f0
	FMADD	f1,  f8, f11, f1
	FMADD	f2,  f8, f12, f2
	FMADD	f3,  f8, f13, f3
	LFD	f8,   2 * SIZE(AO)
	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)
	addi	AO, AO,  1 * SIZE
	addi	BO, BO,  4 * SIZE
	.align 4
LL(58):
	# C(0, 0..3) = alpha * acc + C
	lfs	f13,  ALPHA(SP)
	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  0 * SIZE(CO2)
	LFD	f10, 0 * SIZE(CO3)
	LFD	f11, 0 * SIZE(CO4)
	FMADD	f0,  f0, f13, f8
	FMADD	f1,  f1, f13, f9
	FMADD	f2,  f2, f13, f10
	FMADD	f3,  f3, f13, f11
	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  0 * SIZE(CO2)
	STFD	f2,  0 * SIZE(CO3)
	STFD	f3,  0 * SIZE(CO4)
	.align 4
# LL(59): end of the N=4 panel body — code after this label is on later pages
LL(59):

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -