⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemv_t.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 4 页
字号:
	LFD	b6,  6 * SIZE(BO)	LFD	b7,  7 * SIZE(BO)	LFD	b8,  8 * SIZE(BO)	bdz	LL(33)	.align 4LL(32):	FMADD	y01, a1, b1, y01	LFD	a1,  5 * SIZE(AO1)	FMADD	y02, a2, b1, y02	LFD	a2,  5 * SIZE(AO2)	FMADD	y03, a3, b2, y03	LFD	a3,  6 * SIZE(AO1)	FMADD	y04, a4, b2, y04	LFD	a4,  6 * SIZE(AO2)	FMADD	y09, a5, b3, y09	LFD	a5,  7 * SIZE(AO1)	FMADD	y10, a6, b3, y10	LFD	a6,  7 * SIZE(AO2)	FMADD	y11, a7, b4, y11	LFD	a7,  8 * SIZE(AO1)	FMADD	y12, a8, b4, y12	LFD	a8,  8 * SIZE(AO2)	LFD	b1,  9 * SIZE(BO)	LFD	b2, 10 * SIZE(BO)	LFD	b3, 11 * SIZE(BO)	LFD	b4, 12 * SIZE(BO)	FMADD	y01, a1, b5, y01	LFD	a1,  9 * SIZE(AO1)	FMADD	y02, a2, b5, y02	LFD	a2,  9 * SIZE(AO2)	FMADD	y03, a3, b6, y03	LFD	a3, 10 * SIZE(AO1)	FMADD	y04, a4, b6, y04	LFD	a4, 10 * SIZE(AO2)	FMADD	y09, a5, b7, y09	LFD	a5, 11 * SIZE(AO1)	FMADD	y10, a6, b7, y10	LFD	a6, 11 * SIZE(AO2)	FMADD	y11, a7, b8, y11	LFD	a7, 12 * SIZE(AO1)	FMADD	y12, a8, b8, y12	LFD	a8, 12 * SIZE(AO2)	LFD	b5, 13 * SIZE(BO)	LFD	b6, 14 * SIZE(BO)	LFD	b7, 15 * SIZE(BO)	LFD	b8, 16 * SIZE(BO)	FMADD	y01, a1, b1, y01	LFD	a1, 13 * SIZE(AO1)	FMADD	y02, a2, b1, y02	LFD	a2, 13 * SIZE(AO2)	FMADD	y03, a3, b2, y03	LFD	a3, 14 * SIZE(AO1)	FMADD	y04, a4, b2, y04	LFD	a4, 14 * SIZE(AO2)	FMADD	y09, a5, b3, y09	LFD	a5, 15 * SIZE(AO1)	FMADD	y10, a6, b3, y10	LFD	a6, 15 * SIZE(AO2)	FMADD	y11, a7, b4, y11	LFD	a7, 16 * SIZE(AO1)	FMADD	y12, a8, b4, y12	LFD	a8, 16 * SIZE(AO2)	LFD	b1, 17 * SIZE(BO)	LFD	b2, 18 * SIZE(BO)	LFD	b3, 19 * SIZE(BO)	LFD	b4, 20 * SIZE(BO)	FMADD	y01, a1, b5, y01	LFD	a1, 17 * SIZE(AO1)	FMADD	y02, a2, b5, y02	LFD	a2, 17 * SIZE(AO2)	FMADD	y03, a3, b6, y03	LFD	a3, 18 * SIZE(AO1)	FMADD	y04, a4, b6, y04	LFD	a4, 18 * SIZE(AO2)	FMADD	y09, a5, b7, y09	LFD	a5, 19 * SIZE(AO1)	FMADD	y10, a6, b7, y10	LFD	a6, 19 * SIZE(AO2)	FMADD	y11, a7, b8, y11	LFD	a7, 20 * SIZE(AO1)	FMADD	y12, a8, b8, y12	LFD	a8, 20 * SIZE(AO2)	LFD	b5, 21 * SIZE(BO)	LFD	b6, 22 * SIZE(BO)	LFD	b7, 23 * SIZE(BO)	LFD	b8, 24 * SIZE(BO)	addi	AO1, AO1, 16 * SIZE	addi	AO2, AO2, 16 * SIZE	PREFETCH_A1	PREFETCH_A2	addi	BO,  BO,  16 * SIZE 	bdnz	LL(32)	.align 4	LL(33):	FMADD	y01, a1, b1, y01	LFD	a1,  5 * SIZE(AO1)	FMADD	y02, a2, b1, y02	LFD	a2,  5 * SIZE(AO2)	FMADD	y03, a3, b2, y03	LFD	a3,  6 * SIZE(AO1)	FMADD	y04, a4, b2, y04	LFD	a4,  6 * SIZE(AO2)	FMADD	y09, a5, b3, y09	LFD	a5,  7 * SIZE(AO1)	FMADD	y10, a6, b3, y10	LFD	a6,  7 * SIZE(AO2)	FMADD	y11, a7, b4, y11	LFD	a7,  8 * SIZE(AO1)	FMADD	y12, a8, b4, y12	LFD	a8,  8 * SIZE(AO2)	LFD	b1,  9 * SIZE(BO)	LFD	b2, 10 * SIZE(BO)	LFD	b3, 11 * SIZE(BO)	LFD	b4, 12 * SIZE(BO)	FMADD	y01, a1, b5, y01	LFD	a1,  9 * SIZE(AO1)	FMADD	y02, a2, b5, y02	LFD	a2,  9 * SIZE(AO2)	FMADD	y03, a3, b6, y03	LFD	a3, 10 * SIZE(AO1)	FMADD	y04, a4, b6, y04	LFD	a4, 10 * SIZE(AO2)	FMADD	y09, a5, b7, y09	LFD	a5, 11 * SIZE(AO1)	FMADD	y10, a6, b7, y10	LFD	a6, 11 * SIZE(AO2)	FMADD	y11, a7, b8, y11	LFD	a7, 12 * SIZE(AO1)	FMADD	y12, a8, b8, y12	LFD	a8, 12 * SIZE(AO2)	LFD	b5, 13 * SIZE(BO)	LFD	b6, 14 * SIZE(BO)	LFD	b7, 15 * SIZE(BO)	LFD	b8, 16 * SIZE(BO)	FMADD	y01, a1, b1, y01	LFD	a1, 13 * SIZE(AO1)	FMADD	y02, a2, b1, y02	LFD	a2, 13 * SIZE(AO2)	FMADD	y03, a3, b2, y03	LFD	a3, 14 * SIZE(AO1)	FMADD	y04, a4, b2, y04	LFD	a4, 14 * SIZE(AO2)	FMADD	y09, a5, b3, y09	LFD	a5, 15 * SIZE(AO1)	FMADD	y10, a6, b3, y10	LFD	a6, 15 * SIZE(AO2)	FMADD	y11, a7, b4, y11	LFD	a7, 16 * SIZE(AO1)	FMADD	y12, a8, b4, y12	LFD	a8, 16 * SIZE(AO2)	FMADD	y01, a1, b5, y01	FMADD	y02, a2, b5, y02	FMADD	y03, a3, b6, y03	FMADD	y04, a4, b6, y04	FMADD	y09, a5, b7, y09	FMADD	y10, a6, b7, y10	FMADD	y11, a7, b8, y11	FMADD	y12, a8, b8, y12	addi	AO1, AO1, 16 * SIZE	addi	AO2, AO2, 16 * SIZE	addi	BO,  BO,  16 * SIZE	.align 4LL(34):	andi.	r0, MIN_N, 15	ble	LL(38)	andi.	r0, MIN_N,  8	ble	LL(35)	LFD	a1,  1 * SIZE(AO1)	LFD	a2,  1 * SIZE(AO2)	LFD	a3,  2 * SIZE(AO1)	LFD	a4,  2 * SIZE(AO2)	LFD	b1,  1 * SIZE(BO)	LFD	b2,  2 * SIZE(BO)	LFD	b3,  3 * SIZE(BO)	LFD	b4,  4 * SIZE(BO)	LFD	a5,  3 * SIZE(AO1)	LFD	a6,  3 * SIZE(AO2)	LFD	a7,  4 * SIZE(AO1)	LFD	a8,  4 * SIZE(AO2)	LFD	b5,  5 * SIZE(BO)	LFD	b6,  6 * SIZE(BO)	LFD	b7,  7 * SIZE(BO)	LFD	b8,  8 * SIZE(BO)	FMADD	y01, a1, b1, y01	LFD	a1,  5 * SIZE(AO1)	FMADD	y02, a2, b1, y02	LFD	a2,  5 * SIZE(AO2)	FMADD	y09, a3, b2, y09	LFD	a3,  6 * SIZE(AO1)	FMADD	y10, a4, b2, y10	LFD	a4,  6 * SIZE(AO2)	FMADD	y01, a5, b3, y01	LFD	a5,  7 * SIZE(AO1)	FMADD	y02, a6, b3, y02	LFD	a6,  7 * SIZE(AO2)	FMADD	y09, a7, b4, y09	LFD	a7,  8 * SIZE(AO1)	FMADD	y10, a8, b4, y10	LFD	a8,  8 * SIZE(AO2)	FMADD	y01, a1, b5, y01	FMADD	y02, a2, b5, y02	FMADD	y09, a3, b6, y09	FMADD	y10, a4, b6, y10	FMADD	y01, a5, b7, y01	addi	AO1, AO1, 8 * SIZE	FMADD	y02, a6, b7, y02	addi	AO2, AO2, 8 * SIZE	FMADD	y09, a7, b8, y09	addi	BO,  BO,  8 * SIZE	FMADD	y10, a8, b8, y10	nop	.align 4LL(35):	andi.	r0, MIN_N,  4	ble	LL(36)	LFD	a1,  1 * SIZE(AO1)	LFD	a2,  1 * SIZE(AO2)	LFD	a3,  2 * SIZE(AO1)	LFD	a4,  2 * SIZE(AO2)	LFD	a5,  3 * SIZE(AO1)	LFD	a6,  3 * SIZE(AO2)	LFD	a7,  4 * SIZE(AO1)	LFD	a8,  4 * SIZE(AO2)	LFD	b1,  1 * SIZE(BO)	LFD	b2,  2 * SIZE(BO)	LFD	b3,  3 * SIZE(BO)	LFD	b4,  4 * SIZE(BO)	FMADD	y01, a1, b1, y01	FMADD	y02, a2, b1, y02	FMADD	y09, a3, b2, y09	FMADD	y10, a4, b2, y10	FMADD	y01, a5, b3, y01	addi	AO1, AO1, 4 * SIZE	FMADD	y02, a6, b3, y02	addi	AO2, AO2, 4 * SIZE	FMADD	y09, a7, b4, y09	addi	BO,  BO,  4 * SIZE	FMADD	y10, a8, b4, y10	.align 4LL(36):	andi.	r0, MIN_N,  2	ble	LL(37)	LFD	a1,  1 * SIZE(AO1)	LFD	a2,  1 * SIZE(AO2)	LFD	b1,  1 * SIZE(BO)	LFD	b2,  2 * SIZE(BO)	LFD	a3,  2 * SIZE(AO1)	LFD	a4,  2 * SIZE(AO2)	FMADD	y01, a1, b1, y01	FMADD	y02, a2, b1, y02	FMADD	y09, a3, b2, y09	FMADD	y10, a4, b2, y10	addi	AO1, AO1, 2 * SIZE	addi	AO2, AO2, 2 * SIZE	addi	BO,  BO,  2 * SIZE	.align 4LL(37):	andi.	r0, MIN_N,  1	ble	LL(38)	LFD	a1,  1 * SIZE(AO1)	LFD	b1,  1 * SIZE(BO)	LFD	a2,  1 * SIZE(AO2)	FMADD	y01, a1, b1, y01	FMADD	y02, a2, b1, y02	.align 4LL(38):	mr	BO, CO	lfd	alpha, ALPHA	cmpi	cr0, 0, INCY, SIZE	bne	LL(39)	LFD	a1,  1 * SIZE(CO)	LFD	a2,  2 * SIZE(CO)	FADD	y01, y03, y01	FADD	y02, y04, y02	FADD	y09, y11, y09	FADD	y10, y12, y10	FADD	y01, y09, y01	FADD	y02, y10, y02	FMADD	a1, alpha, y01, a1	FMADD	a2, alpha, y02, a2	STFD	a1,  1 * SIZE(CO)	STFD	a2,  2 * SIZE(CO)	addi	CO, CO, 2 * SIZE	b	LL(40)	.align 4LL(39):	LFDUX	a1, CO, INCY	LFDUX	a2, CO, INCY	FADD	y01, y03, y01	FADD	y02, y04, y02	FADD	y09, y11, y09	FADD	y10, y12, y10	FADD	y01, y09, y01	FADD	y02, y10, y02	FMADD	a1, alpha, f0, a1	FMADD	a2, alpha, f1, a2	STFDUX	a1, BO, INCY	STFDUX	a2, BO, INCY	.align 4LL(40):	andi.	J, N, 1	ble	LL(99)	mr     AO1, A	add    A,   A, LDA	mr     BO, XP		lfd	 y01, FZERO	fmr	 y02, y01	fmr	 y03, y01	fmr	 y04, y01	fmr	 y09, y01	fmr	 y10, y01	fmr	 y11, y01	fmr	 y12, y01	PREFETCH_Y	srawi.	r0,  MIN_N, 4	mtspr	CTR, r0	ble	LL(44)	LFD	a1,  1 * SIZE(AO1)	LFD	a2,  2 * SIZE(AO1)	LFD	a3,  3 * SIZE(AO1)	LFD	a4,  4 * SIZE(AO1)	LFD	a5,  5 * SIZE(AO1)	LFD	a6,  6 * SIZE(AO1)	LFD	a7,  7 * SIZE(AO1)	LFD	a8,  8 * SIZE(AO1)	LFD	b1,  1 * SIZE(BO)	LFD	b2,  2 * SIZE(BO)	LFD	b3,  3 * SIZE(BO)	LFD	b4,  4 * SIZE(BO)	LFD	b5,  5 * SIZE(BO)	LFD	b6,  6 * SIZE(BO)	LFD	b7,  7 * SIZE(BO)	LFD	b8,  8 * SIZE(BO)	bdz	LL(43)	.align 4LL(42):	FMADD	y01, a1, b1, y01	nop	LFD	a1,  9 * SIZE(AO1)	LFD	b1,  9 * SIZE(BO)	FMADD	y02, a2, b2, y02	nop	LFD	a2, 10 * SIZE(AO1)	LFD	b2, 10 * SIZE(BO)	FMADD	y03, a3, b3, y03	nop	LFD	a3, 11 * SIZE(AO1)	LFD	b3, 11 * SIZE(BO)	FMADD	y04, a4, b4, y04	nop	LFD	a4, 12 * SIZE(AO1)	LFD	b4, 12 * SIZE(BO)	FMADD	y01, a5, b5, y01	nop	LFD	a5, 13 * SIZE(AO1)	LFD	b5, 13 * SIZE(BO)	FMADD	y02, a6, b6, y02	nop	LFD	a6, 14 * SIZE(AO1)	LFD	b6, 14 * SIZE(BO)	FMADD	y03, a7, b7, y03	nop	LFD	a7, 15 * SIZE(AO1)	LFD	b7, 15 * SIZE(BO)	FMADD	y04, a8, b8, y04	nop	LFD	a8, 16 * SIZE(AO1)	LFD	b8, 16 * SIZE(BO)	FMADD	y01, a1, b1, y01	nop	LFD	a1, 17 * SIZE(AO1)	LFD	b1, 17 * SIZE(BO)	FMADD	y02, a2, b2, y02	nop	LFD	a2, 18 * SIZE(AO1)	LFD	b2, 18 * SIZE(BO)	FMADD	y03, a3, b3, y03	nop	LFD	a3, 19 * SIZE(AO1)	LFD	b3, 19 * SIZE(BO)	FMADD	y04, a4, b4, y04	nop	LFD	a4, 20 * SIZE(AO1)	LFD	b4, 20 * SIZE(BO)	FMADD	y01, a5, b5, y01	nop	LFD	a5, 21 * SIZE(AO1)	LFD	b5, 21 * SIZE(BO)	FMADD	y02, a6, b6, y02	nop	LFD	a6, 22 * SIZE(AO1)	LFD	b6, 22 * SIZE(BO)	FMADD	y03, a7, b7, y03	nop	LFD	a7, 23 * SIZE(AO1)	LFD	b7, 23 * SIZE(BO)	FMADD	y04, a8, b8, y04	nop	LFD	a8, 24 * SIZE(AO1)	LFD	b8, 24 * SIZE(BO)	addi	AO1, AO1, 16 * SIZE	addi	BO,  BO,  16 * SIZE	PREFETCH_A1 	bdnz	LL(42)	.align 4	LL(43):	FMADD	y01, a1, b1, y01	nop	LFD	a1,  9 * SIZE(AO1)	LFD	b1,  9 * SIZE(BO)	FMADD	y02, a2, b2, y02	nop	LFD	a2, 10 * SIZE(AO1)	LFD	b2, 10 * SIZE(BO)	FMADD	y03, a3, b3, y03	nop	LFD	a3, 11 * SIZE(AO1)	LFD	b3, 11 * SIZE(BO)	FMADD	y04, a4, b4, y04	nop	LFD	a4, 12 * SIZE(AO1)	LFD	b4, 12 * SIZE(BO)	FMADD	y01, a5, b5, y01	nop	LFD	a5, 13 * SIZE(AO1)	LFD	b5, 13 * SIZE(BO)	FMADD	y02, a6, b6, y02	nop	LFD	a6, 14 * SIZE(AO1)	LFD	b6, 14 * SIZE(BO)	FMADD	y03, a7, b7, y03	nop	LFD	a7, 15 * SIZE(AO1)	LFD	b7, 15 * SIZE(BO)	FMADD	y04, a8, b8, y04	nop	LFD	a8, 16 * SIZE(AO1)	LFD	b8, 16 * SIZE(BO)	FMADD	y01, a1, b1, y01	FMADD	y02, a2, b2, y02	FMADD	y03, a3, b3, y03	FMADD	y04, a4, b4, y04	FMADD	y01, a5, b5, y01	addi	AO1, AO1, 16 * SIZE	FMADD	y02, a6, b6, y02	addi	BO,  BO,  16 * SIZE	FMADD	y03, a7, b7, y03	nop	FMADD	y04, a8, b8, y04	nop	.align 4LL(44):	andi.	r0, MIN_N, 15	ble	LL(48)	andi.	r0, MIN_N,  8	ble	LL(45)	LFD	a1,  1 * SIZE(AO1)	LFD	a2,  2 * SIZE(AO1)	LFD	a3,  3 * SIZE(AO1)	LFD	a4,  4 * SIZE(AO1)	LFD	b1,  1 * SIZE(BO)	LFD	b2,  2 * SIZE(BO)	LFD	b3,  3 * SIZE(BO)	LFD	b4,  4 * SIZE(BO)	LFD	a5,  5 * SIZE(AO1)	LFD	a6,  6 * SIZE(AO1)	LFD	a7,  7 * SIZE(AO1)	LFD	a8,  8 * SIZE(AO1)	LFD	b5,  5 * SIZE(BO)	LFD	b6,  6 * SIZE(BO)	LFD	b7,  7 * SIZE(BO)	LFD	b8,  8 * SIZE(BO)	FMADD	y01, a1, b1, y01	FMADD	y02, a2, b2, y02	FMADD	y03, a3, b3, y03	FMADD	y04, a4, b4, y04	FMADD	y01, a5, b5, y01	addi	AO1, AO1, 8 * SIZE	FMADD	y02, a6, b6, y02	addi	BO,  BO,  8 * SIZE	FMADD	y03, a7, b7, y03	nop	FMADD	y04, a8, b8, y04	nop	.align 4LL(45):	andi.	r0, MIN_N,  4	ble	LL(46)	LFD	a1,  1 * SIZE(AO1)	LFD	b1,  1 * SIZE(BO)	LFD	a2,  2 * SIZE(AO1)	LFD	b2,  2 * SIZE(BO)	LFD	a3,  3 * SIZE(AO1)	LFD	b3,  3 * SIZE(BO)	LFD	a4,  4 * SIZE(AO1)	LFD	b4,  4 * SIZE(BO)	FMADD	y01, a1, b1, y01	addi	AO1, AO1, 4 * SIZE	FMADD	y02, a2, b2, y02	addi	AO2, AO2, 4 * SIZE	FMADD	y03, a3, b3, y03	addi	BO,  BO,  4 * SIZE	FMADD	y04, a4, b4, y04	nop	.align 4LL(46):	andi.	r0, MIN_N,  2	ble	LL(47)	LFD	a1,  1 * SIZE(AO1)	LFD	b1,  1 * SIZE(BO)	LFD	a2,  2 * SIZE(AO1)	LFD	b2,  2 * SIZE(BO)	FMADD	y01, a1, b1, y01	addi	AO1, AO1, 2 * SIZE	FMADD	y02, a2, b2, y02	addi	BO,  BO,  2 * SIZE	.align 4LL(47):	andi.	r0, MIN_N,  1	ble	LL(48)	LFD	a1,  1 * SIZE(AO1)	LFD	b1,  1 * SIZE(BO)	FMADD	y01, a1, b1, y01	.align 4LL(48):	mr	BO, CO	lfd	alpha, ALPHA	cmpi	cr0, 0, INCY, SIZE	bne	LL(49)	LFD	a1,  1 * SIZE(CO)	FADD	y01, y02, y01	FADD	y03, y04, y03	FADD	y01, y03, y01	FMADD	a1, alpha, y01, a1	STFD	a1,  1 * SIZE(CO)	b	LL(99)	.align 4LL(49):	LFDUX	a1, CO, INCY	FADD	y01, y02, y01	FADD	y03, y04, y03	FADD	y01, y03, y01	FMADD	a1, alpha, f0, a1	STFDUX	a1, BO, INCY	.align 4LL(99):	subf	A, PLDA_M, A	addi	IS, IS, P	cmp	cr0, 0, IS, M	blt	LL(ISLoop)	.align 4LL(999):	li	r3, 0	lfd	f14,     0(SP)	lfd	f15,     8(SP)	lfd	f16,    16(SP)	lfd	f17,    24(SP)	lfd	f18,    32(SP)	lfd	f19,    40(SP)	lfd	f20,    48(SP)	lfd	f21,    56(SP)	lfd	f22,    64(SP)	lfd	f23,    72(SP)	lfd	f24,    80(SP)	lfd	f25,    88(SP)	lfd	f26,    96(SP)	lfd	f27,   104(SP)	lfd	f28,   112(SP)	lfd	f29,   120(SP)	lfd	f30,   128(SP)	lfd	f31,   136(SP)#ifdef __64BIT__	ld	r14,   160(SP)	ld	r15,   168(SP)	ld	r16,   176(SP)	ld	r17,   184(SP)	ld	r18,   192(SP)	ld	r19,   200(SP)	ld	r20,   208(SP)	ld	r21,   216(SP)	ld	r22,   224(SP)	ld	r23,   232(SP)	ld	r24,   240(SP)	ld	r25,   248(SP)	ld	r26,   256(SP)	ld	r27,   264(SP)	ld	r28,   272(SP)	ld	r29,   280(SP)#else	lwz	r14,   160(SP)	lwz	r15,   164(SP)	lwz	r16,   168(SP)	lwz	r17,   172(SP)	lwz	r18,   176(SP)	lwz	r19,   180(SP)	lwz	r20,   184(SP)	lwz	r21,   188(SP)	lwz	r22,   192(SP)	lwz	r23,   196(SP)	lwz	r24,   200(SP)	lwz	r25,   204(SP)	lwz	r26,   208(SP)	lwz	r27,   212(SP)	lwz	r28,   216(SP)	lwz	r29,   220(SP)#endif	addi	SP, SP, STACKSIZE	blr	EPILOGUE#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -