⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemm_kernel_altivec.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 4 页
字号:
	LOAD_A	a5, OFFSET_4, AO	LOAD_A	a6, OFFSET_5, AO	LOAD_A	a7, OFFSET_6, AO	LOAD_A	a8, OFFSET_7, AO	vspltw	bp2, b1,  3	vmaddfp	c01, a5, bp2, c01	vmaddfp	c02, a6, bp2, c02	vmaddfp	c03, a7, bp2, c03	vmaddfp	c04, a8, bp2, c04	addi	AO, AO, 32 * SIZE	addi	BO, BO,  4 * SIZE	LOAD_A	a1, OFFSET_0, AO	LOAD_A	a2, OFFSET_1, AO	LOAD_A	a3, OFFSET_2, AO	LOAD_A	a4, OFFSET_3, AO	LOAD_A	a5, OFFSET_4, AO	LOAD_A	a6, OFFSET_5, AO	LOAD_A	a7, OFFSET_6, AO	LOAD_A	a8, OFFSET_7, AO	LOAD_B	b1, OFFSET_0, BO	bdnz	LL(133)	.align 4LL(135):	andi.	r0,  J,  3	ble+	LL(138)	cmpwi	cr0, r0, 3	bne	LL(136)	vspltw	bp1, b1,  0	vmaddfp	c01, a1, bp1, c01	vmaddfp	c02, a2, bp1, c02	vmaddfp	c03, a3, bp1, c03	vmaddfp	c04, a4, bp1, c04	addi	AO, AO, 16 * SIZE	LOAD_A	a1, OFFSET_0, AO	LOAD_A	a2, OFFSET_1, AO	LOAD_A	a3, OFFSET_2, AO	LOAD_A	a4, OFFSET_3, AO	vspltw	bp2, b1,  1	vmaddfp	c01, a1, bp2, c01	vmaddfp	c02, a2, bp2, c02	vmaddfp	c03, a3, bp2, c03	vmaddfp	c04, a4, bp2, c04	addi	AO, AO, 16 * SIZE	LOAD_A	a1, OFFSET_0, AO	LOAD_A	a2, OFFSET_1, AO	LOAD_A	a3, OFFSET_2, AO	LOAD_A	a4, OFFSET_3, AO	vspltw	bp1, b1,  2	vmaddfp	c01, a1, bp1, c01	vmaddfp	c02, a2, bp1, c02	vmaddfp	c03, a3, bp1, c03	vmaddfp	c04, a4, bp1, c04	addi	AO, AO, 16 * SIZE	addi	BO, BO,  3 * SIZE	b	LL(138)	.align 4LL(136):	cmpwi	cr0, r0, 2	bne	LL(137)	vspltw	bp1, b1,  0	vspltw	bp2, b1,  1	vmaddfp	c01, a1, bp1, c01	vmaddfp	c02, a2, bp1, c02	vmaddfp	c03, a3, bp1, c03	vmaddfp	c04, a4, bp1, c04	LOAD_A	a1, OFFSET_4, AO	LOAD_A	a2, OFFSET_5, AO	LOAD_A	a3, OFFSET_6, AO	LOAD_A	a4, OFFSET_7, AO	vmaddfp	c01, a1, bp2, c01	vmaddfp	c02, a2, bp2, c02	vmaddfp	c03, a3, bp2, c03	vmaddfp	c04, a4, bp2, c04	addi	AO, AO, 32 * SIZE	addi	BO, BO,  2 * SIZE	b	LL(138)	.align 4LL(137):	cmpwi	cr0, r0, 1	bne	LL(138)	vspltw	bp1, b1,  0	vmaddfp	c01, a1, bp1, c01	vmaddfp	c02, a2, bp1, c02	vmaddfp	c03, a3, bp1, c03	vmaddfp	c04, a4, bp1, c04	addi	AO, AO, 16 * SIZE	addi	BO, BO,  1 * SIZE	.align 4LL(138):	lvx	alpha, OFFSET_0, SP	vxor	VZERO, VZERO, VZERO	lvx	C1, OFFSET_0, CO1	lvx	C2, OFFSET_1, CO1	lvx	C3, OFFSET_2, CO1	lvx	C4, OFFSET_3, CO1	lvx	C5, OFFSET_4, CO1	lvsr	PERMRSHIFT1, 0, CO1	vperm	c00, VZERO, c01,   PERMRSHIFT1	vperm	c01, c01,   c02,   PERMRSHIFT1	vperm	c02, c02,   c03,   PERMRSHIFT1	vperm	c03, c03,   c04,   PERMRSHIFT1	vperm	c04, c04,   VZERO, PERMRSHIFT1	vmaddfp	c00, alpha, c00, C1	vmaddfp	c01, alpha, c01, C2	vmaddfp	c02, alpha, c02, C3	vmaddfp	c03, alpha, c03, C4	vmaddfp	c04, alpha, c04, C5	stvx	c00, OFFSET_0, CO1	stvx	c01, OFFSET_1, CO1	stvx	c02, OFFSET_2, CO1	stvx	c03, OFFSET_3, CO1	stvx	c04, OFFSET_4, CO1	addi	CO1, CO1, 16 * SIZE	addic.	I, I, -1	bgt+	LL(130)	.align 4LL(140):	andi.	I, M,  8	ble	LL(150)	vxor	c01, c01, c01	vxor	c02, c02, c02	mr	BO, B	mr	J, K	andi.	r0,  B,  15	ble+	LL(141)	LOAD_A	a1, OFFSET_0, AO	LOAD_A	a2, OFFSET_1, AO	LOAD_B	b1, OFFSET_0, BO	vspltw	bp1, b1,  2	vspltw	bp2, b1,  3	addi	AO, AO, 8 * SIZE	addi	BO, BO, SIZE	vmaddfp	c01, a1, bp1, c01	vmaddfp	c02, a2, bp1, c02	subi	J, J, 1	cmpwi	cr0, J, 0	ble	LL(148)	LOAD_A	a1, OFFSET_0, AO	LOAD_A	a2, OFFSET_1, AO	addi	AO, AO, 8 * SIZE	addi	BO, BO, SIZE	vmaddfp	c01, a1, bp2, c01	vmaddfp	c02, a2, bp2, c02	subi	J, J, 1	cmpwi	cr0, J, 0	ble	LL(148)	.align 4LL(141):	LOAD_A	a1, OFFSET_0, AO	LOAD_A	a2, OFFSET_1, AO	LOAD_A	a3, OFFSET_2, AO	LOAD_A	a4, OFFSET_3, AO	LOAD_A	a5, OFFSET_4, AO	LOAD_A	a6, OFFSET_5, AO	LOAD_A	a7, OFFSET_6, AO	LOAD_A	a8, OFFSET_7, AO	LOAD_B	b1, OFFSET_0, BO	srawi.	r0,  J,  2	mtspr	CTR, r0	ble	LL(145)	.align 4LL(143):	vspltw	bp1, b1,  0	vmaddfp	c01, a1, bp1, c01	vmaddfp	c02, a2, bp1, c02	vspltw	bp2, b1,  1	vmaddfp	c01, a3, bp2, c01	vmaddfp	c02, a4, bp2, c02	vspltw	bp1, b1,  2	vmaddfp	c01, a5, bp1, c01	vmaddfp	c02, a6, bp1, c02	vspltw	bp2, b1,  3	vmaddfp	c01, a7, bp2, c01	vmaddfp	c02, a8, bp2, c02	addi	AO, AO, 32 * SIZE	addi	BO, BO,  4 * SIZE	LOAD_A	a1, OFFSET_0, AO	LOAD_A	a2, OFFSET_1, AO	LOAD_A	a3, OFFSET_2, AO	LOAD_A	a4, OFFSET_3, AO	LOAD_A	a5, OFFSET_4, AO	LOAD_A	a6, OFFSET_5, AO	LOAD_A	a7, OFFSET_6, AO	LOAD_A	a8, OFFSET_7, AO	LOAD_B	b1, OFFSET_0, BO	bdnz	LL(143)	.align 4LL(145):	andi.	r0,  J,  3	ble+	LL(148)	cmpwi	cr0, r0, 3	bne	LL(146)	vspltw	bp1, b1,  0	vmaddfp	c01, a1, bp1, c01	vmaddfp	c02, a2, bp1, c02	vspltw	bp2, b1,  1	vmaddfp	c01, a3, bp2, c01	vmaddfp	c02, a4, bp2, c02	LOAD_A	a1, OFFSET_4, AO	LOAD_A	a2, OFFSET_5, AO	vspltw	bp1, b1,  2	vmaddfp	c01, a1, bp1, c01	vmaddfp	c02, a2, bp1, c02	addi	AO, AO, 24 * SIZE	addi	BO, BO,  3 * SIZE	b	LL(148)	.align 4LL(146):	cmpwi	cr0, r0, 2	bne	LL(147)	vspltw	bp1, b1,  0	vspltw	bp2, b1,  1	vmaddfp	c01, a1, bp1, c01	vmaddfp	c02, a2, bp1, c02	vmaddfp	c01, a3, bp2, c01	vmaddfp	c02, a4, bp2, c02	addi	AO, AO, 16 * SIZE	addi	BO, BO,  2 * SIZE	b	LL(148)	.align 4LL(147):	cmpwi	cr0, r0, 1	bne	LL(148)	vspltw	bp1, b1,  0	vmaddfp	c01, a1, bp1, c01	vmaddfp	c02, a2, bp1, c02	addi	AO, AO,  8 * SIZE	addi	BO, BO,  1 * SIZE	.align 4LL(148):	lvx	alpha, OFFSET_0, SP	vxor	VZERO, VZERO, VZERO	lvx	C1, OFFSET_0, CO1	lvx	C2, OFFSET_1, CO1	lvx	C3, OFFSET_2, CO1	lvsr	PERMRSHIFT1, 0, CO1	vperm	c00, VZERO, c01,   PERMRSHIFT1	vperm	c01, c01,   c02,   PERMRSHIFT1	vperm	c02, c02,   VZERO, PERMRSHIFT1	vmaddfp	c00, alpha, c00, C1	vmaddfp	c01, alpha, c01, C2	vmaddfp	c02, alpha, c02, C3	stvx	c00, OFFSET_0, CO1	stvx	c01, OFFSET_1, CO1	stvx	c02, OFFSET_2, CO1	addi	CO1, CO1, 8 * SIZE	.align 4LL(150):	andi.	I, M,  4	ble	LL(160)	vxor	c01, c01, c01	mr	BO, B	mr	J, K	andi.	r0,  B,  15	ble+	LL(151)	LOAD_A	a1, OFFSET_0, AO	LOAD_B	b1, OFFSET_0, BO	vspltw	bp1, b1,  2	vspltw	bp2, b1,  3	addi	AO, AO, 4 * SIZE	addi	BO, BO, SIZE	vmaddfp	c01, a1, bp1, c01	subi	J, J, 1	cmpwi	cr0, J, 0	ble	LL(158)	LOAD_A	a1, OFFSET_0, AO	addi	AO, AO, 4 * SIZE	addi	BO, BO, SIZE	vmaddfp	c01, a1, bp2, c01	subi	J, J, 1	cmpwi	cr0, J, 0	ble	LL(158)	.align 4LL(151):	LOAD_A	a1, OFFSET_0, AO	LOAD_A	a2, OFFSET_1, AO	LOAD_A	a3, OFFSET_2, AO	LOAD_A	a4, OFFSET_3, AO	LOAD_B	b1, OFFSET_0, BO	srawi.	r0,  J,  2	mtspr	CTR, r0	ble	LL(155)	.align 4LL(153):	vspltw	bp1, b1,  0	vmaddfp	c01, a1, bp1, c01	vspltw	bp2, b1,  1	vmaddfp	c01, a2, bp2, c01	vspltw	bp1, b1,  2	vmaddfp	c01, a3, bp1, c01	vspltw	bp2, b1,  3	vmaddfp	c01, a4, bp2, c01	addi	AO, AO, 16 * SIZE	addi	BO, BO,  4 * SIZE	LOAD_A	a1, OFFSET_0, AO	LOAD_A	a2, OFFSET_1, AO	LOAD_A	a3, OFFSET_2, AO	LOAD_A	a4, OFFSET_3, AO	LOAD_B	b1, OFFSET_0, BO	bdnz	LL(153)	.align 4LL(155):	andi.	r0,  J,  3	ble+	LL(158)	cmpwi	cr0, r0, 3	bne	LL(156)	vspltw	bp1, b1,  0	vmaddfp	c01, a1, bp1, c01	vspltw	bp2, b1,  1	vmaddfp	c01, a2, bp2, c01	vspltw	bp1, b1,  2	vmaddfp	c01, a3, bp1, c01	addi	AO, AO, 12 * SIZE	addi	BO, BO,  3 * SIZE	b	LL(158)	.align 4LL(156):	cmpwi	cr0, r0, 2	bne	LL(157)	vspltw	bp1, b1,  0	vspltw	bp2, b1,  1	vmaddfp	c01, a1, bp1, c01	vmaddfp	c01, a2, bp2, c01	addi	AO, AO,  8 * SIZE	addi	BO, BO,  2 * SIZE	b	LL(158)	.align 4LL(157):	cmpwi	cr0, r0, 1	bne	LL(158)	vspltw	bp1, b1,  0	vmaddfp	c01, a1, bp1, c01	addi	AO, AO,  4 * SIZE	addi	BO, BO,  1 * SIZE	.align 4LL(158):	lvx	alpha, OFFSET_0, SP	vxor	VZERO, VZERO, VZERO	lvx	C1, OFFSET_0, CO1	lvx	C2, OFFSET_1, CO1	lvsr	PERMRSHIFT1, 0, CO1	vperm	c00, VZERO, c01,   PERMRSHIFT1	vperm	c01, c01,   VZERO, PERMRSHIFT1	vmaddfp	c00, alpha, c00, C1	vmaddfp	c01, alpha, c01, C2	stvx	c00, OFFSET_0, CO1	stvx	c01, OFFSET_1, CO1	addi	CO1, CO1, 4 * SIZE	.align 4LL(160):	andi.	I, M,  2	ble	LL(170)	mr	BO, B	LFD	f8,   0 * SIZE(AO)	LFD	f9,   1 * SIZE(AO)	LFD	f10,  2 * SIZE(AO)	LFD	f11,  3 * SIZE(AO)	LFD	f12,  0 * SIZE(B)	LFD	f13,  1 * SIZE(B)	lfs	f0,  FZERO(SP) 	fmr	f1,  f0	fmr	f2,  f0	fmr	f3,  f0	srawi.	r0,  K,  1	mtspr	CTR, r0	ble	LL(165)	.align 4LL(162):	FMADD	f0,  f8,  f12, f0	FMADD	f1,  f9,  f12, f1	FMADD	f2,  f10, f13, f2	FMADD	f3,  f11, f13, f3	LFD	f8,   4 * SIZE(AO)	LFD	f9,   5 * SIZE(AO)	LFD	f10,  6 * SIZE(AO)	LFD	f11,  7 * SIZE(AO)	LFD	f12,  2 * SIZE(BO)	LFD	f13,  3 * SIZE(BO)	addi	AO, AO,  4 * SIZE	addi	BO, BO,  2 * SIZE	bdnz	LL(162)	.align 4LL(165):	andi.	r0,  K,  1	lfs	f13,  ALPHA(SP)	ble	LL(168)	.align 4LL(166):	FMADD	f0,  f8, f12, f0	FMADD	f1,  f9, f12, f1	addi	AO, AO,  2 * SIZE	addi	BO, BO,  1 * SIZE	.align 4LL(168):	LFD	f8,  0 * SIZE(CO1)	LFD	f9,  1 * SIZE(CO1)	FADD	f0, f0, f2	FADD	f1, f1, f3	FMADD	f0,  f0, f13, f8	FMADD	f1,  f1, f13, f9	STFD	f0,  0 * SIZE(CO1)	STFD	f1,  1 * SIZE(CO1)	addi	CO1, CO1, 2 * SIZE	.align 4LL(170):	andi.	I, M,  1	ble	LL(999)	mr	BO, B	LFD	f8,   0 * SIZE(AO)	LFD	f9,   1 * SIZE(AO)	LFD	f10,  0 * SIZE(B)	LFD	f11,  1 * SIZE(B)	lfs	f0,  FZERO(SP) 	fmr	f1,  f0	srawi.	r0,  K,  1	mtspr	CTR, r0	ble	LL(175)	.align 4LL(172):	FMADD	f0,  f8, f10, f0	FMADD	f1,  f9, f11, f1	LFD	f8,   2 * SIZE(AO)	LFD	f9,   3 * SIZE(AO)	LFD	f10,  2 * SIZE(BO)	LFD	f11,  3 * SIZE(BO)	addi	AO, AO,  2 * SIZE	addi	BO, BO,  2 * SIZE	bdnz	LL(172)	.align 4LL(175):	andi.	r0,  K,  1	lfs	f13,  ALPHA(SP)	ble	LL(178)	.align 4LL(176):	FMADD	f0,  f8, f10, f0	addi	AO, AO,  1 * SIZE	addi	BO, BO,  1 * SIZE	.align 4LL(178):	LFD	f8,  0 * SIZE(CO1)	FADD	f0, f0, f1	FMADD	f0,  f0, f13, f8	STFD	f0,  0 * SIZE(CO1)	.align 4	LL(999):	mr	SP, STACK	li	r0,  0 * 16	lvx	v20, SP, r0	li	r0,  1 * 16	lvx	v21, SP, r0	li	r0,  2 * 16	lvx	v22, SP, r0	li	r0,  3 * 16	lvx	v23, SP, r0	li	r0,  4 * 16	lvx	v24, SP, r0	li	r0,  5 * 16	lvx	v25, SP, r0	li	r0,  6 * 16	lvx	v26, SP, r0	li	r0,  7 * 16	lvx	v27, SP, r0	li	r0,  8 * 16	lvx	v28, SP, r0	li	r0,  9 * 16	lvx	v29, SP, r0	li	r0, 10 * 16	lvx	v30, SP, r0	li	r0, 11 * 16	lvx	v31, SP, r0	mtspr	VRsave, VREG#ifdef __64BIT__	ld	r31,  192(SP)	ld	r30,  200(SP)	ld	r29,  208(SP)	ld	r28,  216(SP)	ld	r27,  224(SP)	ld	r26,  232(SP)	ld	r25,  240(SP)	ld	r24,  248(SP)	ld	r23,  256(SP)	ld	r22,  264(SP)	ld	r21,  272(SP)	ld	r20,  280(SP)	ld	r19,  288(SP)	ld	r18,  296(SP)	ld	r17,  304(SP)	ld	r16,  312(SP)	ld	r15,  320(SP)	ld	r14,  328(SP)#else	lwz	r31,  192(SP)	lwz	r30,  196(SP)	lwz	r29,  200(SP)	lwz	r28,  204(SP)	lwz	r27,  208(SP)	lwz	r26,  212(SP)	lwz	r25,  216(SP)	lwz	r24,  220(SP)	lwz	r23,  224(SP)	lwz	r22,  228(SP)	lwz	r21,  232(SP)	lwz	r20,  236(SP)	lwz	r19,  240(SP)	lwz	r18,  244(SP)	lwz	r17,  248(SP)	lwz	r16,  252(SP)	lwz	r15,  256(SP)	lwz	r14,  260(SP)#endif	addi	SP, SP, STACKSIZE	blr	EPILOGUE#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -