⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemm_kernel_4x4.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
	ADD	c05, t2, c05	LD	b1,  0 * SIZE(BO)	MUL	a1, b2, t2	LD	b2,  1 * SIZE(BO)	ADD	c09, t3, c09	MUL	a1, b3, t3	LD	b3,  2 * SIZE(BO)	ADD	c13, t4, c13	MUL	a1, b4, t4	LD	a1,  0 * SIZE(AO)	lda	AO,  1 * SIZE(AO)	ADD	c01, t1, c01	LD	b4,  3 * SIZE(BO)	MUL	a1, b1, t1	lda	BO,  4 * SIZE(BO)	.align 4$L38:	ADD	c05, t2, c05	unop	MUL	a1, b2, t2	LD	a5,  0 * SIZE(C1)	ADD	c09, t3, c09	unop	MUL	a1, b3, t3	LD	b5,  0 * SIZE(C2)	ADD	c13, t4, c13	unop	MUL	a1, b4, t4	LD	a2,  0 * SIZE(C3)	ADD	c01, t1, c01	unop	MUL	alpha, c01, c01	LD	a3,  0 * SIZE(C4)	ADD	c05, t2, c05	unop	MUL	alpha, c05, c05	unop	ADD	c09, t3, c09	MUL	alpha, c09, c09	ADD	c13, t4, c13	MUL	alpha, c13, c13	ADD	c01,  a5, c01	ADD	c05,  b5, c05	ADD	c09,  a2, c09	ADD	c13,  a3, c13	ST	c01,  0 * SIZE(C1)	ST	c05,  0 * SIZE(C2)	ST	c09,  0 * SIZE(C3)	ST	c13,  0 * SIZE(C4)	.align 4$L39: 	mov	BO, B	unop	unop	bgt	J, $L01	.align 4$L40:	and	N, 2, J	mov	C,  C1	addq	C,  LDC, C2	ble	J, $L80		mov	A, AO	lda	J,        -1(J)	unop	addq	C2, LDC, C	.align 4$L50:	sra	M,  2, I	fclr	t1	fclr	t2	fclr	t3	fclr	t4	fclr	c01	fclr	c05 	fclr	c02	fclr	c06	ble	I, $L60	.align 4$L51:	LD	a1,  0 * SIZE(AO)	fclr	c03	LD	a2,  1 * SIZE(AO)	fclr	c07	LD	a3,  2 * SIZE(AO)	fclr	c04	LD	a4,  3 * SIZE(AO)	fclr	c08	LD	b1,  0 * SIZE(B)	LD	b2,  1 * SIZE(B)	LD	b3,  2 * SIZE(B)	LD	b4,  3 * SIZE(B)	lda	L,        -2(K)	lda	BO,  2 * SIZE(B)	lda	AO,  4 * SIZE(AO)	ble	L, $L55	.align	4$L52:	ADD	c05, t1, c05	unop	MUL	a1, b1, t1	unop	ADD	c06, t2, c06	lda	L,   -2(L)	MUL	a2, b1, t2	unop	ADD	c07, t3, c07	unop	MUL	a3, b1, t3	unop	ADD	c08, t4, c08	unop	MUL	a4, b1, t4	LD	b1,  2 * SIZE(BO)	ADD	c01, t1, c01	unop	MUL	a1, b2, t1	LD	a1,  0 * SIZE(AO)	ADD	c02, t2, c02	lda	BO,  4 * SIZE(BO)	MUL	a2, b2, t2	LD	a2,  1 * SIZE(AO)	ADD	c03, t3, c03	unop	MUL	a3, b2, t3	LD	a3,  2 * SIZE(AO)	ADD	c04, t4, c04	unop	MUL	a4, b2, t4	LD	a5,  3 * SIZE(AO)	ADD	c05, t1, c05	unop	MUL	a1, b3, t1	LD	b2, -1 * SIZE(BO)	ADD	c06, t2, c06	unop	MUL	a2, b3, t2	unop	ADD	c07, t3, c07	unop	MUL	a3, b3, t3	lda	AO,  8 * SIZE(AO)	ADD	c08, t4, c08	unop	MUL	a5, b3, t4	LD	b3,  0 * SIZE(BO)	ADD	c01, t1, c01	unop	MUL	a1, b4, t1	LD	a1, -4 * SIZE(AO)	ADD	c02, t2, c02	unop	MUL	a2, b4, t2	LD	a2, -3 * SIZE(AO)	ADD	c03, t3, c03	LD	a4, -1 * SIZE(AO)	MUL	a3, b4, t3	LD	a3, -2 * SIZE(AO)	ADD	c04, t4, c04	MUL	a5, b4, t4	LD	b4,  1 * SIZE(BO)	bgt	L,  $L52	.align 4$L55:	ADD	c05, t1, c05	ldt	alpha, ALPHA	MUL	a1, b1, t1	blbs	K, $L58	.align 4	ADD	c06, t2, c06	MUL	a2, b1, t2	ADD	c07, t3, c07	MUL	a3, b1, t3	ADD	c08, t4, c08	unop	MUL	a4, b1, t4	LD	b1,  0 * SIZE(BO)	ADD	c01, t1, c01	unop	MUL	a1, b2, t1	LD	a1,  0 * SIZE(AO)	ADD	c02, t2, c02	unop	MUL	a2, b2, t2	LD	a2,  1 * SIZE(AO)	ADD	c03, t3, c03	unop	MUL	a3, b2, t3	LD	a3,  2 * SIZE(AO)	ADD	c04, t4, c04	MUL	a4, b2, t4	LD	a4,  3 * SIZE(AO)	lda	AO,  4 * SIZE(AO)	ADD	c05, t1, c05	LD	b2,  1 * SIZE(BO)	MUL	a1, b1, t1	lda	BO,  2 * SIZE(BO)	.align 4$L58:	ADD	c06, t2, c06	unop	MUL	a2, b1, t2	LD	c09,  0 * SIZE(C1)	ADD	c07, t3, c07	unop	MUL	a3, b1, t3	LD	c10,  1 * SIZE(C1)	ADD	c08, t4, c08	unop	MUL	a4, b1, t4	LD	c11,  2 * SIZE(C1)	ADD	c01, t1, c01	unop	MUL	a1, b2, t1	LD	c12,  3 * SIZE(C1)	ADD	c02, t2, c02	unop	MUL	a2, b2, t2	LD	c13,  0 * SIZE(C2)	ADD	c03, t3, c03	unop	MUL	a3, b2, t3	LD	c14,  1 * SIZE(C2)	ADD	c04, t4, c04	unop	MUL	a4, b2, t4	LD	c15,  2 * SIZE(C2)	ADD	c05, t1, c05	unop	MUL	alpha, c01, c01	LD	c16,  3 * SIZE(C2)	ADD	c06, t2, c06	unop	MUL	alpha, c02, c02	unop	ADD	c07, t3, c07	MUL	alpha, c03, c03	ADD	c08, t4, c08	MUL	alpha, c04, c04	MUL	alpha, c05, c05	ADD	c01,  c09, c01	MUL	alpha, c06, c06	ADD	c02,  c10, c02	MUL	alpha, c07, c07	ADD	c03,  c11, c03	MUL	alpha, c08, c08	ADD	c04,  c12, c04	ADD	c05,  c13, c05	ST	c01,  0 * SIZE(C1)	fclr	t1	lda	I,        -1(I)	ADD	c06,  c14, c06	ST	c02,  1 * SIZE(C1)	fclr	t2	unop	ADD	c07,  c15, c07	ST	c03,  2 * SIZE(C1)	fclr	t3	lda	C2,   4 * SIZE(C2)	ADD	c08,  c16, c08	ST	c04,  3 * SIZE(C1)	fclr	t4	lda	C1,   4 * SIZE(C1)	ST	c05, -4 * SIZE(C2)	fclr	c01	ST	c06, -3 * SIZE(C2)	fclr	c05	ST	c07, -2 * SIZE(C2) 	fclr	c02	ST	c08, -1 * SIZE(C2)	fclr	c06	bgt	I, $L51	.align 4$L60:	and	M,  2, I	ble	I, $L70	.align 4$L61:	LD	a1,  0 * SIZE(AO)	LD	a2,  1 * SIZE(AO)	LD	a3,  2 * SIZE(AO)	LD	a4,  3 * SIZE(AO)	LD	b1,  0 * SIZE(B)	lda	L,        -2(K)	LD	b2,  1 * SIZE(B)	lda	AO,  2 * SIZE(AO)	LD	b3,  2 * SIZE(B)	LD	b4,  3 * SIZE(B)	lda	BO,  2 * SIZE(B)	ble	L, $L65	.align	4$L62:	ADD	c01, t1, c01	unop	MUL	a1, b1, t1	unop	ADD	c02, t2, c02	lda	AO,    4 * SIZE(AO)	MUL	a2, b1, t2	LD	b1,  2 * SIZE(BO)	ADD	c05, t3, c05	lda	L,        -2(L)	MUL	a1, b2, t3	LD	a1, -2 * SIZE(AO)	ADD	c06, t4, c06	unop	MUL	a2, b2, t4	LD	a2, -1 * SIZE(AO)	ADD	c01, t1, c01	LD	b2,  3 * SIZE(BO)	MUL	a3, b3, t1	lda	BO,    4 * SIZE(BO)	ADD	c02, t2, c02	unop	MUL	a4, b3, t2	LD	b3,  0 * SIZE(BO)	ADD	c05, t3, c05	unop	MUL	a3, b4, t3	LD	a3,  0 * SIZE(AO)	ADD	c06, t4, c06	MUL	a4, b4, t4	LD	b4,  1 * SIZE(BO)	unop	LD	a4,  1 * SIZE(AO)	unop	unop	bgt	L,  $L62	.align 4$L65:	ADD	c01, t1, c01	ldt	alpha, ALPHA	MUL	a1, b1, t1	blbs	K, $L68	.align 4	ADD	c02, t2, c02	unop	MUL	a2, b1, t2	LD	b1,  0 * SIZE(BO)	ADD	c05, t3, c05	lda	BO,  2 * SIZE(BO)	MUL	a1, b2, t3	LD	a1,  0 * SIZE(AO)	ADD	c06, t4, c06	unop	MUL	a2, b2, t4	LD	a2,  1 * SIZE(AO)	ADD	c01, t1, c01	LD	b2, -1 * SIZE(BO)	MUL	a1, b1, t1	lda	AO,  2 * SIZE(AO)	.align 4$L68:	ADD	c02, t2, c02	unop	MUL	a2, b1, t2	LD	c09,  0 * SIZE(C1)	ADD	c05, t3, c05	unop	MUL	a1, b2, t3	LD	c10,  1 * SIZE(C1)	ADD	c06, t4, c06	unop	MUL	a2, b2, t4	LD	c11,  0 * SIZE(C2)	ADD	c01, t1, c01	unop	MUL	alpha, c01, c01	LD	c12,  1 * SIZE(C2)	ADD	c02, t2, c02	lda	C1,   2 * SIZE(C1)	MUL	alpha, c02, c02	lda	C2,   2 * SIZE(C2)	ADD	c05, t3, c05	MUL	alpha, c05, c05	ADD	c06, t4, c06	MUL	alpha, c06, c06	ADD	c01,  c09, c01	fclr	t1	ADD	c02,  c10, c02	fclr	t2	ADD	c05,  c11, c05	fclr	t3	ADD	c06,  c12, c06	fclr	t4	ST	c01, -2 * SIZE(C1)	fclr	c01	ST	c02, -1 * SIZE(C1) 	fclr	c02	ST	c05, -2 * SIZE(C2)	fclr	c05	ST	c06, -1 * SIZE(C2)	fclr	c06	.align 4$L70:	and	M,  1, I	ble	I, $L79	.align 4$L71:	LD	a1,  0 * SIZE(AO)	LD	a2,  1 * SIZE(AO)	LD	b1,  0 * SIZE(B)	lda	L,        -2(K)	LD	b2,  1 * SIZE(B)	lda	AO,  1 * SIZE(AO)	LD	b3,  2 * SIZE(B)	LD	b4,  3 * SIZE(B)	lda	BO,  2 * SIZE(B)	ble	L, $L75	.align	4$L72:	ADD	c01, t1, c01	lda	L,        -2(L)	MUL	a1, b1, t1	LD	b1,  2 * SIZE(BO)	ADD	c05, t2, c05	MUL	a1, b2, t2	LD	a1,  1 * SIZE(AO)	LD	b2,  3 * SIZE(BO)	ADD	c02, t3, c02	lda	AO,    2 * SIZE(AO)	MUL	a2, b3, t3	LD	b3,  4 * SIZE(BO)	ADD	c06, t4, c06	MUL	a2, b4, t4	LD	a2,  0 * SIZE(AO)	LD	b4,  5 * SIZE(BO)	lda	BO,    4 * SIZE(BO)	unop	unop	bgt	L,  $L72	.align 4$L75:	ADD	c01, t1, c01	ldt	alpha, ALPHA	MUL	a1, b1, t1	blbs	K, $L78	.align 4	ADD	c05, t2, c05	MUL	a1, b2, t2	LD	a1,  0 * SIZE(AO)	LD	b1,  0 * SIZE(BO)	ADD	c01, t1, c01	LD	b2,  1 * SIZE(BO)	lda	AO,  1 * SIZE(AO)	MUL	a1, b1, t1	lda	BO,  2 * SIZE(BO)	.align 4$L78:	ADD	c05, t2, c05	MUL	a1, b2, t2	LD	a5,  0 * SIZE(C1)	ADD	c02, t3, c02	ADD	c06, t4, c06	LD	b5,  0 * SIZE(C2)	ADD	c01, c02, c01	ADD	c05, c06, c05	ADD	c01, t1, c01	ADD	c05, t2, c05	MUL	alpha, c01, c01	MUL	alpha, c05, c05	ADD	c01,  a5, c01	ADD	c05,  b5, c05	ST	c01,  0 * SIZE(C1)	ST	c05,  0 * SIZE(C2)	.align 4$L79: 	mov	BO, B	unop	unop	.align 4$L80:	and	N, 1, J	mov	C,  C1	mov	A, AO	ble	J, $L999	.align 4$L90:	sra	M,  2, I	fclr	t1	fclr	t2	fclr	t3	fclr	t4	fclr	c01	fclr	c02 	fclr	c03	fclr	c04	ble	I, $L100	.align 4$L91:	LD	a1,  0 * SIZE(AO)	LD	a2,  1 * SIZE(AO)	LD	a3,  2 * SIZE(AO)	LD	a4,  3 * SIZE(AO)	LD	b1,  0 * SIZE(B)	LD	b2,  1 * SIZE(B)	LD	b3,  2 * SIZE(B)	LD	b4,  3 * SIZE(B)	sra	K, 2, L	mov	B, BO	unop	ble	L, $L95	.align	5$L92:	ADD	c01, t1, c01	unop	MUL	a1, b1, t1	LD	a1,  4 * SIZE(AO)	ADD	c02, t2, c02	lda	L,   -1(L)	MUL	a2, b1, t2	LD	a2,  5 * SIZE(AO)	ADD	c03, t3, c03	unop	MUL	a3, b1, t3	LD	a3,  6 * SIZE(AO)	ADD	c04, t4, c04	MUL	a4, b1, t4	LD	a4,  7 * SIZE(AO)	LD	b1,  4 * SIZE(BO)	ADD	c01, t1, c01	unop	MUL	a1, b2, t1	LD	a1,  8 * SIZE(AO)	ADD	c02, t2, c02	unop	MUL	a2, b2, t2	LD	a2,  9 * SIZE(AO)	ADD	c03, t3, c03	unop	MUL	a3, b2, t3	LD	a3, 10 * SIZE(AO)	ADD	c04, t4, c04	MUL	a4, b2, t4	LD	a4, 11 * SIZE(AO)	LD	b2,  5 * SIZE(BO)	ADD	c01, t1, c01	unop	MUL	a1, b3, t1	LD	a1, 12 * SIZE(AO)	ADD	c02, t2, c02	unop	MUL	a2, b3, t2	LD	a2, 13 * SIZE(AO)	ADD	c03, t3, c03	unop	MUL	a3, b3, t3	LD	a3, 14 * SIZE(AO)	ADD	c04, t4, c04	MUL	a4, b3, t4	LD	a5, 15 * SIZE(AO)	LD	b3,  6 * SIZE(BO)	ADD	c01, t1, c01	MUL	a1, b4, t1	LD	a1, 16 * SIZE(AO)	lda	AO, 16 * SIZE(AO)	ADD	c02, t2, c02	lda	BO,  4 * SIZE(BO)	MUL	a2, b4, t2	LD	a2,  1 * SIZE(AO)	ADD	c03, t3, c03	LD	a4,  3 * SIZE(AO)	MUL	a3, b4, t3	LD	a3,  2 * SIZE(AO)	ADD	c04, t4, c04	MUL	a5, b4, t4	LD	b4,  3 * SIZE(BO)	bgt	L,  $L92	.align 4$L95:	and	K, 3, L	ldt	alpha, ALPHA	unop	ble	L, $L98	.align 4$L96:	ADD	c01, t1, c01	lda	L,   -1(L)	MUL	a1, b1, t1	LD	a1,  4 * SIZE(AO)	ADD	c02, t2, c02	lda	BO,  1 * SIZE(BO)	MUL	a2, b1, t2	LD	a2,  5 * SIZE(AO)	ADD	c03, t3, c03	unop	MUL	a3, b1, t3	LD	a3,  6 * SIZE(AO)	ADD	c04, t4, c04	MUL	a4, b1, t4	LD	a4,  7 * SIZE(AO)	LD	b1,  0 * SIZE(BO)	lda	AO,  4 * SIZE(AO)	bgt	L,  $L96	.align 4$L98:	ADD	c01, t1, c01	LD	c05,  0 * SIZE(C1)	ADD	c02, t2, c02	LD	c06,  1 * SIZE(C1)	ADD	c03, t3, c03	LD	c07,  2 * SIZE(C1)	ADD	c04, t4, c04	LD	c08,  3 * SIZE(C1)	MUL	alpha, c01, c01	MUL	alpha, c02, c02	MUL	alpha, c03, c03	MUL	alpha, c04, c04	ADD	c01,  c05, c01	fclr	t1	ADD	c02,  c06, c02	fclr	t2	ADD	c03,  c07, c03	lda	I,        -1(I)	unop	fclr	t3	ADD	c04,  c08, c04	unop	fclr	t4	unop	ST	c01,  0 * SIZE(C1)	fclr	c01	ST	c02,  1 * SIZE(C1)	fclr	c02	ST	c03,  2 * SIZE(C1) 	fclr	c03	ST	c04,  3 * SIZE(C1)	fclr	c04	lda	C1,   4 * SIZE(C1)	bgt	I, $L91	.align 4$L100:	and	M,  2, I	unop	unop	ble	I, $L110	.align 4$L101:	LD	a1,  0 * SIZE(AO)	LD	a2,  1 * SIZE(AO)	LD	a3,  2 * SIZE(AO)	LD	a4,  3 * SIZE(AO)	LD	b1,  0 * SIZE(B)	sra	K, 2, L	LD	b2,  1 * SIZE(B)	mov	B, BO	LD	b3,  2 * SIZE(B)	unop	LD	b4,  3 * SIZE(B)	ble	L, $L105	.align	5$L102:	ADD	c01, t1, c01	lda	L,        -1(L)	MUL	a1, b1, t1	LD	a1,  4 * SIZE(AO)	ADD	c02, t2, c02	MUL	a2, b1, t2	LD	a2,  5 * SIZE(AO)	LD	b1,  4 * SIZE(BO)	ADD	c03, t3, c03	lda	BO,    4 * SIZE(BO)	MUL	a3, b2, t3	LD	a3,  6 * SIZE(AO)	ADD	c04, t4, c04	MUL	a4, b2, t4	LD	a5,  7 * SIZE(AO)	LD	b2,  1 * SIZE(BO)	ADD	c01, t1, c01	MUL	a1, b3, t1	LD	a1,  8 * SIZE(AO)	lda	AO,  8 * SIZE(AO)	ADD	c02, t2, c02	MUL	a2, b3, t2	LD	b3,  2 * SIZE(BO)	LD	a2,  1 * SIZE(AO)	ADD	c03, t3, c03	LD	a4,  3 * SIZE(AO)	MUL	a3, b4, t3	LD	a3,  2 * SIZE(AO)	ADD	c04, t4, c04	MUL	a5, b4, t4	LD	b4,  3 * SIZE(BO)	bgt	L,  $L102	.align 4$L105:	and	K, 3, L	ldt	alpha, ALPHA	LD	a3,  0 * SIZE(C1)	LD	a4,  1 * SIZE(C1)	ble	L, $L108	.align 4$L106:	ADD	c01, t1, c01	lda	L,        -1(L)	MUL	a1, b1, t1	LD	a1,  2 * SIZE(AO)	ADD	c02, t2, c02	MUL	a2, b1, t2	LD	a2,  3 * SIZE(AO)	LD	b1,  1 * SIZE(BO)	lda	AO,  2 * SIZE(AO)	unop	lda	BO,  1 * SIZE(BO)	bgt	L,  $L106	.align 4$L108:	ADD	c01, t1, c01	fclr	t1	ADD	c02, t2, c02	fclr	t2	ADD	c03, t3, c03	fclr	t3	ADD	c04, t4, c04	fclr	t4	ADD	c01, c03, c01	ADD	c02, c04, c02	MUL	alpha, c01, c01	MUL	alpha, c02, c02	ADD	c01,  a3, c01 	fclr	c03	ADD	c02,  a4, c02	fclr	c04	ST	c01,  0 * SIZE(C1)	fclr	c01	ST	c02,  1 * SIZE(C1)	fclr	c02	lda	C1,   2 * SIZE(C1)	.align 4$L110:	and	M,  1, I	ble	I, $L999	.align 4$L111:	LD	a1,  0 * SIZE(AO)	LD	a2,  1 * SIZE(AO)	LD	a3,  2 * SIZE(AO)	LD	a4,  3 * SIZE(AO)	LD	b1,  0 * SIZE(B)	LD	b2,  1 * SIZE(B)	LD	b3,  2 * SIZE(B)	LD	b4,  3 * SIZE(B)	sra	K, 2, L	mov	B, BO	unop	ble	L, $L115	.align	4$L112:	ADD	c01, t1, c01	MUL	a1,  b1, t1	LD	a1,  4 * SIZE(AO)	LD	b1,  4 * SIZE(BO)	ADD	c02, t2, c02	MUL	a2,  b2, t2	LD	a2,  5 * SIZE(AO)	LD	b2,  5 * SIZE(BO)	ADD	c03, t3, c03	MUL	a3,  b3, t3	LD	a3,  6 * SIZE(AO)	LD	b3,  6 * SIZE(BO)	ADD	c04, t4, c04	MUL	a4,  b4, t4	LD	a4,  7 * SIZE(AO)	LD	b4,  7 * SIZE(BO)	lda	L,        -1(L)	lda	AO,    4 * SIZE(AO)	lda	BO,    4 * SIZE(BO)	bgt	L,  $L112	.align 4$L115:	and	K, 3, L	ldt	alpha, ALPHA	LD	a2,  0 * SIZE(C1)	ble	L, $L118	.align	4$L116:	ADD	c01, t1, c01	MUL	a1,  b1, t1	LD	a1,  1 * SIZE(AO)	LD	b1,  1 * SIZE(BO)	lda	L,        -1(L)	lda	AO,  1 * SIZE(AO)	lda	BO,  1 * SIZE(BO)	bgt	L,  $L116	.align 4$L118:	ADD	c01, t1, c01	ADD	c02, t2, c02	ADD	c03, t3, c03	ADD	c04, t4, c04	ADD	c01, c02, c01	ADD	c03, c04, c03	ADD	c01, c03, c01	MUL	alpha, c01, c01	ADD	c01,  a2, c01	ST	c01,  0 * SIZE(C1)	.align 4$L999:	ldt	$f2,   0($sp)	ldt	$f3,   8($sp)	ldt	$f4,  16($sp)	ldt	$f5,  24($sp)	ldt	$f6,  32($sp)	ldt	$f7,  40($sp)	ldt	$f8,  48($sp)	ldt	$f9,  56($sp)	clr	$0	lda	$sp, STACKSIZE($sp)	ret	.ident	VERSION	.end	CNAME

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -