⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemm_kernel_altivec_cell.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 4 页
字号:
	/*
	 * Tail of an outer loop whose head, LL(01), lies before this excerpt:
	 * decrement J and loop again while it is still positive.
	 */
	addic.	J, J, -1
	bgt	LL(01)
	.align 4

/*
 * LL(60): section executed when bit 1 of N is set (two columns of C).
 * andi. never produces a negative result, so the following "ble" is taken
 * exactly when the tested bit is clear.
 */
LL(60):
	andi.	r0, N,  2
	ble	LL(120)

	/* CO1/CO2 address the two output columns; C is advanced by 2 * LDC. */
	mr	CO1, C
	add	CO2, C,  LDC
	add	C,  CO2, LDC

	/* I = M >> 4: number of 16-element row blocks handled by LL(71). */
	mr	AO, A
	srawi.	I, M,  4
	ble	LL(80)
	.align 4

/*
 * LL(71): one 16-row block for both columns.
 * c01..c04 accumulate the CO1 column, c05..c08 the CO2 column.
 * Accumulators are cleared with vxor while the first A/B vectors are loaded.
 */
LL(71):
	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	LOAD_A	a1, OFFSET_0, AO
	vxor	c04, c04, c04
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c07, c07, c07
	vxor	c08, c08, c08

	/* Restart the B panel for this row block and touch the C lines for store. */
	mr	BO, B
	dcbtst	CO1, PREC
	dcbtst	CO2, PREC

	vspltw	bp1, b1, 0

	/* CTR = K >> 1: the main loop below consumes two K steps per pass. */
	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(75)
	.align 4

/*
 * LL(72): main loop. Per pass: a1..a4 are multiplied by lanes 0 and 1 of b1,
 * a5..a8 by lanes 2 and 3; AO advances 32 * SIZE and BO 4 * SIZE.
 * The vspltw instructions are interleaved with the multiply-adds; keep the
 * statement order as is.
 */
LL(72):
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08

	vmaddfp	c01, a5, bp1, c01
	vspltw	bp2, b1, 3
	vmaddfp	c02, a6, bp1, c02
	vmaddfp	c03, a7, bp1, c03
	vmaddfp	c04, a8, bp1, c04

	/* Next B vector is fetched before bp2 (lane 3 of the old b1) is consumed. */
	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0

	vmaddfp	c05, a5, bp2, c05
	vmaddfp	c06, a6, bp2, c06
	vmaddfp	c07, a7, bp2, c07
	vmaddfp	c08, a8, bp2, c08

	addi	AO, AO, 32 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	bdnz	LL(72)
	.align 4

/*
 * LL(75): test for an odd K, load the alpha vector from OFFSET_0(SP) and
 * clear VZERO. lvx and vxor do not alter CR0, so "ble+" still tests the
 * andi. result; the "+" suffix is a static predict-taken hint.
 */
LL(75):
	andi.	r0,  K,  1
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(78)
	.align 4

/* LL(76): remaining single K step, using the already loaded a1..a4 and b1. */
LL(76):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	addi	AO, AO, 16 * SIZE
	vmaddfp	c03, a3, bp1, c03
	addi	BO, BO,  2 * SIZE
	vmaddfp	c04, a4, bp1, c04
	nop

	vmaddfp	c05, a1, bp2, c05
	vmaddfp	c06, a2, bp2, c06
	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08
	.align 4

/*
 * LL(78): write-back, C = alpha * acc + C, for both columns.
 * lvx/stvx ignore the low four address bits, so the four accumulator vectors
 * are shifted right by the byte offset of CO1 (resp. CO2) using lvsr + vperm,
 * zero-filled at both ends, and merged into five aligned vectors.
 */
LL(78):
	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1
	lvx	C4, OFFSET_3, CO1
	lvx	C5, OFFSET_4, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	/*
	 * NOTE(review): CO3/CO4 are not assigned anywhere in this section and
	 * PERMRSHIFT3/PERMRSHIFT4 are not read here; presumably carried over
	 * from a wider-N variant. Confirm against the full file before removing.
	 */
	lvsr	PERMRSHIFT3, 0, CO3
	lvsr	PERMRSHIFT4, 0, CO4

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3
	vmaddfp	c03, alpha, c03, C4
	vmaddfp	c04, alpha, c04, C5

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1

	/* Same sequence for the second column (c05..c08 -> CO2). */
	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2
	lvx	C4, OFFSET_3, CO2
	lvx	C5, OFFSET_4, CO2

	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   c06,   PERMRSHIFT2
	vperm	c06, c06,   c07,   PERMRSHIFT2
	vperm	c07, c07,   c08,   PERMRSHIFT2
	vperm	c08, c08,   VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C2
	vmaddfp	c06, alpha, c06, C3
	vmaddfp	c07, alpha, c07, C4
	vmaddfp	c08, alpha, c08, C5

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2
	stvx	c06, OFFSET_2, CO2
	stvx	c07, OFFSET_3, CO2
	stvx	c08, OFFSET_4, CO2

	/* Advance both output pointers by one row block and loop while I > 0. */
	addi	CO1, CO1, 16 * SIZE
	addi	CO2, CO2, 16 * SIZE
	addic.	I, I, -1
	bgt+	LL(71)
	.align 4

/*
 * LL(80): 8-row remainder (M & 8) for the two columns.
 * a1/a2 are paired with b1 lanes 0/1 and a3/a4 with lanes 2/3, so
 * c01,c02,c05,c06 and c03,c04,c07,c08 hold partial sums that are added
 * together in LL(88).
 */
LL(80):
	andi.	I, M,  8
	ble	LL(90)

	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	LOAD_A	a1, OFFSET_0, AO
	vxor	c04, c04, c04
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c07, c07, c07
	vxor	c08, c08, c08

	mr	BO, B

	vspltw	bp1, b1, 0
	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(85)
	.align 4

/* LL(82): two K steps per pass; AO += 16 * SIZE, BO += 4 * SIZE. */
LL(82):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06

	vmaddfp	c03, a3, bp1, c03
	vspltw	bp2, b1, 3
	vmaddfp	c04, a4, bp1, c04

	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0
	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08

	addi	AO, AO, 16 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	bdnz	LL(82)
	.align 4

/* LL(85): odd-K test, alpha load and VZERO clear (same pattern as LL(75)). */
LL(85):
	andi.	r0,  K,  1
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(88)
	.align 4

/* LL(86): remaining single K step. */
LL(86):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	addi	AO, AO,  8 * SIZE
	vmaddfp	c05, a1, bp2, c05
	addi	BO, BO,  2 * SIZE
	vmaddfp	c06, a2, bp2, c06
	.align 4

/* LL(88): fold the partial sums, then shifted write-back over three vectors. */
LL(88):
	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1

	vaddfp	c01, c01, c03
	vaddfp	c02, c02, c04
	vaddfp	c05, c05, c07
	vaddfp	c06, c06, c08

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	/* NOTE(review): PERMRSHIFT3/4 unused in this section; see LL(78). */
	lvsr	PERMRSHIFT3, 0, CO3
	lvsr	PERMRSHIFT4, 0, CO4

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2

	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   c06,   PERMRSHIFT2
	vperm	c06, c06,   VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C2
	vmaddfp	c06, alpha, c06, C3

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2
	stvx	c06, OFFSET_2, CO2

	addi	CO1, CO1, 8 * SIZE
	addi	CO2, CO2, 8 * SIZE
	.align 4

/*
 * LL(90): 4-row remainder (M & 4) for the two columns.
 * a1 is paired with b1 lanes 0/1 (c01, c05) and a2 with lanes 2/3 (c02, c06);
 * the pairs are added together in LL(98).
 */
LL(90):
	andi.	I, M,  4
	ble	LL(100)

	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	vxor	c06, c06, c06

	mr	BO, B

	vspltw	bp1, b1, 0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(95)
	.align 4

/* LL(92): two K steps per pass; AO += 8 * SIZE, BO += 4 * SIZE. */
LL(92):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2

	vmaddfp	c02, a2, bp1, c02
	vspltw	bp2, b1, 3

	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0

	vmaddfp	c06, a2, bp2, c06

	addi	AO, AO,  8 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	bdnz	LL(92)
	.align 4

/* LL(95): odd-K test, alpha load and VZERO clear (same pattern as LL(75)). */
LL(95):
	andi.	r0,  K,  1
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(98)
	.align 4

/* LL(96): remaining single K step. */
LL(96):
	vspltw	bp2, b1, 1
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c05, a1, bp2, c05
	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  2 * SIZE
	.align 4

/* LL(98): fold the partial sums, then shifted write-back over two vectors. */
LL(98):
	vaddfp	c01, c01, c02
	vaddfp	c05, c05, c06
	/*
	 * NOTE(review): c09, c10, c13 and c14 are neither initialised nor read
	 * anywhere in this section; these two adds look like leftovers from a
	 * wider-N variant. Confirm against the full file before removing.
	 */
	vaddfp	c09, c09, c10
	vaddfp	c13, c13, c14

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	/* NOTE(review): PERMRSHIFT3/4 unused in this section; see LL(78). */
	lvsr	PERMRSHIFT3, 0, CO3
	lvsr	PERMRSHIFT4, 0, CO4

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2

	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C2

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2

	addi	CO1, CO1, 4 * SIZE
	addi	CO2, CO2, 4 * SIZE
	.align 4

/*
 * LL(100): 2-row remainder (M & 2), done with scalar floating-point registers.
 * LFD/STFD/FMADD/FADD are macros defined outside this excerpt; presumably
 * FMADD d, a, c, b computes a * c + b like fmadd -- confirm.
 * f0..f3 and f4..f7 are two sets of partial sums, added together in LL(108).
 */
LL(100):
	andi.	I, M,  2
	ble	LL(110)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)
	LFD	f9,   1 * SIZE(AO)

	LFD	f10,  0 * SIZE(B)
	LFD	f11,  1 * SIZE(B)
	LFD	f12,  2 * SIZE(B)
	LFD	f13,  3 * SIZE(B)

	/* Clear f0..f7 from the value stored at FZERO(SP). */
	lfs	f0,  FZERO(SP)
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0
	fmr	f4,  f0
	fmr	f5,  f0
	fmr	f6,  f0
	fmr	f7,  f0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(105)
	.align 4

/* LL(102): two K steps per pass; AO += 4 * SIZE, BO += 4 * SIZE. */
LL(102):
	FMADD	f0,  f8, f10, f0
	FMADD	f1,  f9, f10, f1
	FMADD	f2,  f8, f11, f2
	FMADD	f3,  f9, f11, f3

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)

	FMADD	f4,  f8, f12, f4
	FMADD	f5,  f9, f12, f5
	FMADD	f6,  f8, f13, f6
	FMADD	f7,  f9, f13, f7

	/* Preload operands for the next pass before the pointers are bumped. */
	LFD	f8,  4 * SIZE(AO)
	LFD	f9,  5 * SIZE(AO)

	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  4 * SIZE
	bdnz	LL(102)
	.align 4

/* LL(105): odd-K test; f13 is reloaded with the scalar at ALPHA(SP). */
LL(105):
	andi.	r0,  K,  1
	lfs	f13,  ALPHA(SP)
	ble	LL(108)
	.align 4

/* LL(106): remaining single K step. */
LL(106):
	FMADD	f0,  f8, f10, f0
	FMADD	f1,  f9, f10, f1
	FMADD	f2,  f8, f11, f2
	FMADD	f3,  f9, f11, f3

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)

	LFD	f10,  2 * SIZE(BO)
	LFD	f11,  3 * SIZE(BO)

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  2 * SIZE
	.align 4

/* LL(108): fold partial sums, scale by f13 (alpha), add C, store 2x2 results. */
LL(108):
	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  1 * SIZE(CO1)
	LFD	f10, 0 * SIZE(CO2)
	LFD	f11, 1 * SIZE(CO2)

	FADD	f0, f0, f4
	FADD	f1, f1, f5
	FADD	f2, f2, f6
	FADD	f3, f3, f7

	FMADD	f0,  f0, f13, f8
	FMADD	f1,  f1, f13, f9
	FMADD	f2,  f2, f13, f10
	FMADD	f3,  f3, f13, f11

	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  1 * SIZE(CO1)
	STFD	f2,  0 * SIZE(CO2)
	STFD	f3,  1 * SIZE(CO2)

	addi	CO1, CO1, 2 * SIZE
	addi	CO2, CO2, 2 * SIZE
	.align 4

/*
 * LL(110): last single row (M & 1) for the two columns.
 * f0/f1 accumulate products with f8, f2/f3 products with f9; the pairs are
 * added together in LL(118).
 */
LL(110):
	andi.	I, M,  1
	ble	LL(119)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)
	LFD	f9,   1 * SIZE(AO)

	LFD	f10,  0 * SIZE(B)
	LFD	f11,  1 * SIZE(B)
	LFD	f12,  2 * SIZE(B)
	LFD	f13,  3 * SIZE(B)

	lfs	f0,  FZERO(SP)
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(115)
	.align 4

/* LL(112): two K steps per pass; AO += 2 * SIZE, BO += 4 * SIZE. */
LL(112):
	FMADD	f0,  f8, f10, f0
	FMADD	f1,  f8, f11, f1
	FMADD	f2,  f9, f12, f2
	FMADD	f3,  f9, f13, f3

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)

	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  4 * SIZE
	bdnz	LL(112)
	.align 4

/* LL(115): odd-K test; f13 is reloaded with the scalar at ALPHA(SP). */
LL(115):
	andi.	r0,  K,  1
	lfs	f13,  ALPHA(SP)
	ble	LL(118)
	.align 4

/* LL(116): remaining single K step. */
LL(116):
	FMADD	f0,  f8, f10, f0
	FMADD	f1,  f8, f11, f1

	LFD	f8,   1 * SIZE(AO)

	LFD	f10,  2 * SIZE(BO)
	LFD	f11,  3 * SIZE(BO)

	addi	AO, AO,  1 * SIZE
	addi	BO, BO,  2 * SIZE
	.align 4

/* LL(118): fold partial sums, scale by f13 (alpha), add C, store one element per column. */
LL(118):
	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  0 * SIZE(CO2)

	FADD	f0, f0, f2
	FADD	f1, f1, f3

	FMADD	f0,  f0, f13, f8
	FMADD	f1,  f1, f13, f9

	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  0 * SIZE(CO2)
	.align 4

/* LL(119): B takes the final BO, i.e. it moves past the panel just consumed. */
LL(119):
	mr	B, BO
	.align 4

/*
 * LL(120): section executed when bit 0 of N is set (one column of C).
 * LL(999), LL(140), LL(138) and LL(135) are defined after this excerpt.
 */
LL(120):
	andi.	r0, N,  1
	ble	LL(999)

	mr	CO1, C
	mr	AO, A
	srawi.	I, M,  4
	ble	LL(140)
	.align 4

/*
 * LL(130): one 16-row block for the single column; J counts the remaining
 * K steps. If B is 16-byte aligned, go straight to LL(131). Otherwise up to
 * two K steps are peeled first, using lanes 2 and 3 of the vector loaded at
 * BO (presumably B then sits 8 bytes past a 16-byte boundary -- confirm),
 * with BO advanced by SIZE per step.
 */
LL(130):
	vxor	c01, c01, c01
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	vxor	c04, c04, c04

	mr	BO, B

	dcbtst	CO1, PREC

	mr	J, K

	andi.	r0,  B,  15
	ble+	LL(131)

	/* First peeled K step (lane 2 of b1). */
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_B	b1, OFFSET_0, BO
	vspltw	bp1, b1,  2
	vspltw	bp2, b1,  3

	addi	AO, AO, 16 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04
	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(138)

	/* Second peeled K step (lane 3 of b1). */
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	addi	AO, AO, 16 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp2, c01
	vmaddfp	c02, a2, bp2, c02
	vmaddfp	c03, a3, bp2, c03
	vmaddfp	c04, a4, bp2, c04
	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(138)
	.align 4

/* LL(131): preload eight A vectors and one B vector; CTR = J >> 2. */
LL(131):
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	LOAD_B	b1, OFFSET_0, BO

	srawi.	r0,  J,  2
	mtspr	CTR, r0
	ble	LL(135)
	.align 4

/*
 * LL(133): main loop body for the single-column case. Only its first part is
 * visible here; the remainder (including the loop branch) follows this excerpt.
 */
LL(133):
	vspltw	bp1, b1,  0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	vspltw	bp2, b1,  1
	vmaddfp	c01, a5, bp2, c01
	vmaddfp	c02, a6, bp2, c02
	vmaddfp	c03, a7, bp2, c03
	vmaddfp	c04, a8, bp2, c04

	addi	AO, AO, 32 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	vspltw	bp1, b1,  2
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -