📄 zgemm_kernel_2x2.s
字号:
/*
 * Tail of the routine closed by ".end CNAME" below.  The entry point, the
 * stack-frame setup and the $L01 loop head are outside this excerpt.
 *
 * LD/ST/ADD/SUB/MUL, SIZE, ALPHA_R/ALPHA_I, STACKSIZE and the symbolic
 * register names (a1..a5, b1..b5, c01..c14, t1..t4, alpha_r, alpha_i,
 * AO, BO, B, C, C1, C2, I, J, K, L, M, N) are defined elsewhere --
 * presumably macros / register aliases; confirm against the file header.
 * Three-operand forms are read here as "op src1, src2, dst".
 *
 * Sections visible here:
 *   $L20..$L28  taken when (M & 1) != 0; updates two values at C1 and two at C2
 *   $L29        resets B from BO and branches back to $L01 while J > 0
 *   $L30..$L48  taken when (N & 1) != 0; I = M >> 1 passes, four values at C1 each
 *   $L50..$L58  taken when (M & 1) != 0 in that last part; two values at C1
 *   $L999       restores $f2..$f9, releases the stack frame and returns
 */
$L20:
	and	M,  1, I		/* I = M & 1 */
	ble	I, $L29			/* nothing to do here when that bit is clear */
	.align 4

/*
 * Preload four A values and four B values, clear c02/c06/c10/c14, and set
 * L = K - 2.  The other accumulators and t1..t4 used below are not
 * initialised in this excerpt -- presumably cleared before reaching $L20;
 * verify against the preceding code.
 */
$L21:
	LD	a1,  0 * SIZE(AO)
	fclr	c02
	LD	a2,  1 * SIZE(AO)
	fclr	c06

	LD	a3,  2 * SIZE(AO)
	fclr	c10
	LD	a4,  3 * SIZE(AO)
	fclr	c14

	LD	b1,  0 * SIZE(B)
	lda	L,        -2(K)
	LD	b2,  1 * SIZE(B)
	lda	AO,  2 * SIZE(AO)

	LD	b3,  2 * SIZE(B)
	LD	b4,  3 * SIZE(B)
	lda	BO,  4 * SIZE(B)
	ble	L, $L25			/* K <= 2: skip the main loop */
	.align	5

/*
 * Main loop.  Each pass issues two rounds of eight MULs, advances AO by
 * 4 * SIZE and BO by 8 * SIZE, and lowers L by 2; it repeats while L > 0.
 * Every ADD folds in the t1..t4 product written by the previous round of
 * MULs, so the first four ADDs of a pass consume products from the pass
 * (or preload) before it.  The fourth B value of the second round is
 * loaded into b5 so that b4 can already be reloaded for the next pass.
 */
$L22:
	ADD	c09, t1, c09
	unop
	MUL	a1, b1, t1
	unop

	ADD	c10, t2, c10
	unop
	MUL	a2, b1, t2
	LD	b1,  0 * SIZE(BO)

	ADD	c13, t3, c13
	unop
	MUL	a1, b2, t3
	lda	BO,  8 * SIZE(BO)

	ADD	c14, t4, c14
	unop
	MUL	a2, b2, t4
	LD	b2, -7 * SIZE(BO)

	ADD	c01, t1, c01
	unop
	MUL	a1, b3, t1
	unop

	ADD	c02, t2, c02
	unop
	MUL	a2, b3, t2
	LD	b3, -6 * SIZE(BO)

	ADD	c05, t3, c05
	unop
	MUL	a1, b4, t3
	LD	a1,  2 * SIZE(AO)

	ADD	c06, t4, c06
	MUL	a2, b4, t4
	LD	b5, -5 * SIZE(BO)

	ADD	c09, t1, c09
	unop
	MUL	a3, b1, t1
	LD	a2,  3 * SIZE(AO)

	ADD	c10, t2, c10
	unop
	MUL	a4, b1, t2
	LD	b1, -4 * SIZE(BO)

	ADD	c13, t3, c13
	unop
	MUL	a3, b2, t3
	lda	AO,  4 * SIZE(AO)

	ADD	c14, t4, c14
	MUL	a4, b2, t4
	LD	b2, -3 * SIZE(BO)

	ADD	c01, t1, c01
	lda	L,        -2(L)
	MUL	a3, b3, t1
	LD	b4, -1 * SIZE(BO)

	ADD	c02, t2, c02
	unop
	MUL	a4, b3, t2
	LD	b3, -2 * SIZE(BO)

	ADD	c05, t3, c05
	unop
	MUL	a3, b5, t3
	LD	a3,  0 * SIZE(AO)

	ADD	c06, t4, c06
	MUL	a4, b5, t4
	LD	a4,  1 * SIZE(AO)
	bgt	L,  $L22
	.align 4

/*
 * Loop exit.  alpha_r is loaded from ALPHA_R here.  When the low bit of K
 * is set the code jumps straight to $L28; otherwise one more round of
 * eight MULs is issued first, reloading a1/a2 and b1..b4 and advancing
 * AO by 2 * SIZE and BO by 4 * SIZE.
 */
$L25:
	ADD	c09, t1, c09
	ldt	alpha_r, ALPHA_R
	MUL	a1, b1, t1
	blbs	K, $L28
	.align 4

	ADD	c10, t2, c10
	unop
	MUL	a2, b1, t2
	LD	b1,  0 * SIZE(BO)

	ADD	c13, t3, c13
	unop
	MUL	a1, b2, t3
	unop

	ADD	c14, t4, c14
	unop
	MUL	a2, b2, t4
	LD	b2,  1 * SIZE(BO)

	ADD	c01, t1, c01
	unop
	MUL	a1, b3, t1
	lda	AO,  2 * SIZE(AO)

	ADD	c02, t2, c02
	unop
	MUL	a2, b3, t2
	LD	b3,  2 * SIZE(BO)

	ADD	c05, t3, c05
	unop
	MUL	a1, b4, t3
	LD	a1, -2 * SIZE(AO)

	ADD	c06, t4, c06
	unop
	MUL	a2, b4, t4
	LD	a2, -1 * SIZE(AO)

	ADD	c09, t1, c09
	LD	b4,  3 * SIZE(BO)
	MUL	a1, b1, t1
	lda	BO,  4 * SIZE(BO)
	.align 4

/*
 * Final round: the last eight products are issued and drained into the
 * accumulators while alpha_i and the current contents of C1[0..1] and
 * C2[0..1] (c03, c04, c11, c12) are loaded.
 */
$L28:
	ADD	c10, t2, c10
	unop
	MUL	a2, b1, t2
	ldt	alpha_i, ALPHA_I

	ADD	c13, t3, c13
	unop
	MUL	a1, b2, t3
	LD	c03,  0 * SIZE(C1)

	ADD	c14, t4, c14
	unop
	MUL	a2, b2, t4
	LD	c04,  1 * SIZE(C1)

	ADD	c01, t1, c01
	unop
	MUL	a1, b3, t1
	LD	c11,  0 * SIZE(C2)

	ADD	c02, t2, c02
	unop
	MUL	a2, b3, t2
	LD	c12,  1 * SIZE(C2)

	ADD	c05, t3, c05
	MUL	a1, b4, t3
	ADD	c06, t4, c06
	MUL	a2, b4, t4

	ADD	c09, t1, c09
	ADD	c10, t2, c10
	ADD	c13, t3, c13
	ADD	c14, t4, c14

/*
 * Pairwise combination of the accumulators; the build-time macro selects
 * the sign pattern.  Presumably this forms the real/imaginary parts of the
 * complex products for the different conjugation variants -- confirm
 * against where NN..RR are defined.
 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(CC) || defined(CR) || defined(RC) || defined(RR)
	SUB	c01, c06, c01
	ADD	c02, c05, c02
	SUB	c09, c14, c09
	ADD	c10, c13, c10
#else
	ADD	c01, c06, c01
	SUB	c02, c05, c02
	ADD	c09, c14, c09
	SUB	c10, c13, c10
#endif

	MUL	alpha_r, c01, t1
	MUL	alpha_r, c02, t2
	MUL	alpha_r, c09, t3
	MUL	alpha_r, c10, t4

/*
 * Add the alpha-scaled results to the values loaded from C1/C2.
 *   first variant:  c03 += alpha_r*c01 - alpha_i*c02
 *                   c04 += alpha_r*c02 + alpha_i*c01
 *   #else variant:  c03 += alpha_r*c01 + alpha_i*c02
 *                   c04 += alpha_i*c01 - alpha_r*c02
 * c11/c12 follow the same pattern using c09/c10.
 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NC) || defined(TC) || defined(NR) || defined(TR)
	ADD	c03,  t1, c03
	MUL	alpha_i, c02, t1
	ADD	c04,  t2, c04
	MUL	alpha_i, c01, t2

	ADD	c11,  t3, c11
	MUL	alpha_i, c10, t3
	ADD	c12,  t4, c12
	MUL	alpha_i, c09, t4

	SUB	c03,  t1, c03
	ADD	c04,  t2, c04
	SUB	c11,  t3, c11
	ADD	c12,  t4, c12
#else
	ADD	c03,  t1, c03
	MUL	alpha_i, c02, t1
	SUB	c04,  t2, c04
	MUL	alpha_i, c01, t2

	ADD	c11,  t3, c11
	MUL	alpha_i, c10, t3
	SUB	c12,  t4, c12
	MUL	alpha_i, c09, t4

	ADD	c03,  t1, c03
	ADD	c04,  t2, c04
	ADD	c11,  t3, c11
	ADD	c12,  t4, c12
#endif

	ST	c03,  0 * SIZE(C1)
	ST	c04,  1 * SIZE(C1)
	ST	c11,  0 * SIZE(C2)
	ST	c12,  1 * SIZE(C2)
	.align 4

/*
 * B takes the value of BO, then control returns to $L01 (outside this
 * excerpt) while J > 0.
 * NOTE(review): J is not modified anywhere in the visible code; presumably
 * it is decremented near $L01 -- confirm, otherwise this branch never
 * falls through.
 */
$L29:
	mov	BO, B
	unop
	unop
	bgt	J, $L01
	.align 4

/*
 * Part taken when (N & 1) != 0: C1 is set from C and AO from A; if the bit
 * is clear, jump directly to the epilogue.
 */
$L30:
	and	N, 1, J
	mov	C,  C1
	mov	A, AO
	ble	J, $L999
	.align 4

/* I = M >> 1 passes of the $L41 loop; clear t1..t4 and c01/c05/c02/c06. */
$L40:
	sra	M,  1, I
	fclr	t1
	fclr	t2
	fclr	t3
	fclr	t4

	fclr	c01
	fclr	c05
	fclr	c02
	fclr	c06
	ble	I, $L50
	.align 4

/*
 * Per-pass setup: preload four A values and four B values, clear
 * c03/c07/c04/c08, set L = K - 2.  BO is re-derived from B on every pass.
 */
$L41:
	LD	a1,  0 * SIZE(AO)
	fclr	c03
	LD	a2,  1 * SIZE(AO)
	fclr	c07

	LD	a3,  2 * SIZE(AO)
	fclr	c04
	LD	a4,  3 * SIZE(AO)
	fclr	c08

	LD	b1,  0 * SIZE(B)
	LD	b2,  1 * SIZE(B)
	LD	b3,  2 * SIZE(B)
	LD	b4,  3 * SIZE(B)

	lda	L,        -2(K)
	lda	BO,  2 * SIZE(B)
	lda	AO,  4 * SIZE(AO)
	ble	L, $L45
	.align	5

/*
 * Main loop: four rounds of four MULs per pass, AO advances by 8 * SIZE,
 * BO by 4 * SIZE, L drops by 2.  a5 temporarily carries the fourth A value
 * of the second half so that a4 can be reloaded for the next pass before
 * the last MUL that needs the old value.
 */
$L42:
	ADD	c05, t1, c05
	unop
	MUL	a1, b1, t1
	unop

	ADD	c06, t2, c06
	lda	L,   -2(L)
	MUL	a2, b1, t2
	unop

	ADD	c07, t3, c07
	unop
	MUL	a3, b1, t3
	unop

	ADD	c08, t4, c08
	unop
	MUL	a4, b1, t4
	LD	b1,  2 * SIZE(BO)

	ADD	c01, t1, c01
	unop
	MUL	a1, b2, t1
	LD	a1,  0 * SIZE(AO)

	ADD	c02, t2, c02
	lda	BO,  4 * SIZE(BO)
	MUL	a2, b2, t2
	LD	a2,  1 * SIZE(AO)

	ADD	c03, t3, c03
	unop
	MUL	a3, b2, t3
	LD	a3,  2 * SIZE(AO)

	ADD	c04, t4, c04
	unop
	MUL	a4, b2, t4
	LD	a5,  3 * SIZE(AO)

	ADD	c05, t1, c05
	unop
	MUL	a1, b3, t1
	LD	b2, -1 * SIZE(BO)

	ADD	c06, t2, c06
	unop
	MUL	a2, b3, t2
	unop

	ADD	c07, t3, c07
	unop
	MUL	a3, b3, t3
	lda	AO,  8 * SIZE(AO)

	ADD	c08, t4, c08
	unop
	MUL	a5, b3, t4
	LD	b3,  0 * SIZE(BO)

	ADD	c01, t1, c01
	unop
	MUL	a1, b4, t1
	LD	a1, -4 * SIZE(AO)

	ADD	c02, t2, c02
	unop
	MUL	a2, b4, t2
	LD	a2, -3 * SIZE(AO)

	ADD	c03, t3, c03
	LD	a4, -1 * SIZE(AO)
	MUL	a3, b4, t3
	LD	a3, -2 * SIZE(AO)

	ADD	c04, t4, c04
	MUL	a5, b4, t4
	LD	b4,  1 * SIZE(BO)
	bgt	L,  $L42
	.align 4

/*
 * Loop exit: load alpha_r; when the low bit of K is set go straight to
 * $L48, otherwise run one extra pair of rounds (AO += 4 * SIZE,
 * BO += 2 * SIZE) first.
 */
$L45:
	ADD	c05, t1, c05
	ldt	alpha_r, ALPHA_R
	MUL	b1, a1, t1
	blbs	K, $L48
	.align 4

	ADD	c06, t2, c06
	MUL	a2, b1, t2
	ADD	c07, t3, c07
	MUL	a3, b1, t3

	ADD	c08, t4, c08
	unop
	MUL	a4, b1, t4
	LD	b1,  0 * SIZE(BO)

	ADD	c01, t1, c01
	unop
	MUL	a1, b2, t1
	LD	a1,  0 * SIZE(AO)

	ADD	c02, t2, c02
	unop
	MUL	a2, b2, t2
	LD	a2,  1 * SIZE(AO)

	ADD	c03, t3, c03
	unop
	MUL	a3, b2, t3
	LD	a3,  2 * SIZE(AO)

	ADD	c04, t4, c04
	MUL	a4, b2, t4
	LD	a4,  3 * SIZE(AO)
	lda	AO,  4 * SIZE(AO)

	ADD	c05, t1, c05
	LD	b2,  1 * SIZE(BO)
	MUL	a1, b1, t1
	lda	BO,  2 * SIZE(BO)
	.align 4

/*
 * Final rounds: drain the remaining products, decrement the pass counter
 * I, and load alpha_i plus the four current values C1[0..3] into c09..c12.
 */
$L48:
	ADD	c06, t2, c06
	unop
	MUL	a2, b1, t2
	ldt	alpha_i, ALPHA_I

	ADD	c07, t3, c07
	lda	I,        -1(I)
	MUL	a3, b1, t3
	LD	c09,  0 * SIZE(C1)

	ADD	c08, t4, c08
	unop
	MUL	a4, b1, t4
	LD	c10,  1 * SIZE(C1)

	ADD	c01, t1, c01
	unop
	MUL	a1, b2, t1
	LD	c11,  2 * SIZE(C1)

	ADD	c02, t2, c02
	unop
	MUL	a2, b2, t2
	LD	c12,  3 * SIZE(C1)

	ADD	c03, t3, c03
	MUL	a3, b2, t3
	ADD	c04, t4, c04
	MUL	a4, b2, t4

	ADD	c05, t1, c05
	ADD	c06, t2, c06
	ADD	c07, t3, c07
	ADD	c08, t4, c08

/* Pairwise combination of the accumulators; sign pattern chosen at build time. */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(CC) || defined(CR) || defined(RC) || defined(RR)
	SUB	c01, c06, c01
	ADD	c02, c05, c02
	SUB	c03, c08, c03
	ADD	c04, c07, c04
#else
	ADD	c01, c06, c01
	SUB	c02, c05, c02
	ADD	c03, c08, c03
	SUB	c04, c07, c04
#endif

	MUL	alpha_r, c01, t1
	MUL	alpha_r, c02, t2
	MUL	alpha_r, c03, t3
	MUL	alpha_r, c04, t4

/*
 * Add the alpha-scaled results to c09..c12 (the values loaded from C1).
 * t1..t4 are cleared as soon as they are consumed, ready for the next
 * pass of $L41 or for $L51.
 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NC) || defined(TC) || defined(NR) || defined(TR)
	ADD	c09,  t1, c09
	MUL	alpha_i, c02, t1
	ADD	c10,  t2, c10
	MUL	alpha_i, c01, t2

	ADD	c11,  t3, c11
	MUL	alpha_i, c04, t3
	ADD	c12,  t4, c12
	MUL	alpha_i, c03, t4

	SUB	c09,  t1, c09
	fclr	t1
	ADD	c10,  t2, c10
	fclr	t2

	SUB	c11,  t3, c11
	fclr	t3
	ADD	c12,  t4, c12
	fclr	t4
#else
	ADD	c09,  t1, c09
	MUL	alpha_i, c02, t1
	SUB	c10,  t2, c10
	MUL	alpha_i, c01, t2

	ADD	c11,  t3, c11
	MUL	alpha_i, c04, t3
	SUB	c12,  t4, c12
	MUL	alpha_i, c03, t4

	ADD	c09,  t1, c09
	fclr	t1
	ADD	c10,  t2, c10
	fclr	t2

	ADD	c11,  t3, c11
	fclr	t3
	ADD	c12,  t4, c12
	fclr	t4
#endif

	/* Store the four results, re-clear c01/c02/c05/c06, advance C1 by 4 * SIZE. */
	ST	c09,  0 * SIZE(C1)
	fclr	c01
	ST	c10,  1 * SIZE(C1)
	fclr	c02

	ST	c11,  2 * SIZE(C1)
	unop
	fclr	c05
	unop

	ST	c12,  3 * SIZE(C1)
	fclr	c06
	lda	C1,   4 * SIZE(C1)
	bgt	I, $L41
	.align 4

/* Remainder when (M & 1) != 0; otherwise go to the epilogue. */
$L50:
	and	M,  1, I
	ble	I, $L999
	.align 4

/*
 * Preload four A and four B values and set L = K - 2.  c01/c02/c05/c06 and
 * t1..t4 are not cleared here; on every visible path to this label they
 * were cleared in $L40 or at the end of $L48.
 */
$L51:
	LD	a1,  0 * SIZE(AO)
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)

	LD	b1,  0 * SIZE(B)
	lda	L,        -2(K)
	LD	b2,  1 * SIZE(B)
	lda	AO,  2 * SIZE(AO)

	LD	b3,  2 * SIZE(B)
	LD	b4,  3 * SIZE(B)
	lda	BO,  2 * SIZE(B)
	ble	L, $L55
	.align	5

/*
 * Main loop: two rounds of four MULs per pass (a1/a2 with b1/b2, then
 * a3/a4 with b3/b4); AO and BO both advance by 4 * SIZE, L drops by 2.
 */
$L52:
	ADD	c01, t1, c01
	unop
	MUL	a1, b1, t1
	unop

	ADD	c02, t2, c02
	lda	AO,  4 * SIZE(AO)
	MUL	a2, b1, t2
	LD	b1,  2 * SIZE(BO)

	ADD	c05, t3, c05
	lda	L,        -2(L)
	MUL	a1, b2, t3
	LD	a1, -2 * SIZE(AO)

	ADD	c06, t4, c06
	unop
	MUL	a2, b2, t4
	LD	a2, -1 * SIZE(AO)

	ADD	c01, t1, c01
	LD	b2,  3 * SIZE(BO)
	MUL	a3, b3, t1
	lda	BO,  4 * SIZE(BO)

	ADD	c02, t2, c02
	unop
	MUL	a4, b3, t2
	LD	b3,  0 * SIZE(BO)

	ADD	c05, t3, c05
	unop
	MUL	a3, b4, t3
	LD	a3,  0 * SIZE(AO)

	ADD	c06, t4, c06
	MUL	a4, b4, t4
	LD	b4,  1 * SIZE(BO)
	unop

	LD	a4,  1 * SIZE(AO)
	unop
	unop
	bgt	L,  $L52
	.align 4

/*
 * Loop exit: load alpha_r; when the low bit of K is set go straight to
 * $L58, otherwise run one extra round (AO += 2 * SIZE, BO += 2 * SIZE).
 */
$L55:
	ADD	c01, t1, c01
	ldt	alpha_r, ALPHA_R
	MUL	a1, b1, t1
	blbs	K, $L58
	.align 4

	ADD	c02, t2, c02
	unop
	MUL	a2, b1, t2
	LD	b1,  0 * SIZE(BO)

	ADD	c05, t3, c05
	lda	BO,  2 * SIZE(BO)
	MUL	a1, b2, t3
	LD	a1,  0 * SIZE(AO)

	ADD	c06, t4, c06
	unop
	MUL	a2, b2, t4
	LD	a2,  1 * SIZE(AO)

	ADD	c01, t1, c01
	LD	b2, -1 * SIZE(BO)
	MUL	a1, b1, t1
	lda	AO,  2 * SIZE(AO)
	.align 4

/*
 * Final round: drain the last products, load alpha_i and the two current
 * values C1[0..1] (c03, c04), then combine, scale and store.
 */
$L58:
	ADD	c02, t2, c02
	unop
	MUL	a2, b1, t2
	ldt	alpha_i, ALPHA_I

	ADD	c05, t3, c05
	unop
	MUL	a1, b2, t3
	LD	c03,  0 * SIZE(C1)

	ADD	c06, t4, c06
	unop
	MUL	a2, b2, t4
	LD	c04,  1 * SIZE(C1)

	ADD	c01, t1, c01
	ADD	c02, t2, c02
	ADD	c05, t3, c05
	ADD	c06, t4, c06

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(CC) || defined(CR) || defined(RC) || defined(RR)
	SUB	c01, c06, c01
	ADD	c02, c05, c02
#else
	ADD	c01, c06, c01
	SUB	c02, c05, c02
#endif

	/* All four alpha products are formed up front here: t1/t2 use alpha_r, t3/t4 use alpha_i. */
	MUL	alpha_r, c01, t1
	MUL	alpha_r, c02, t2
	MUL	alpha_i, c02, t3
	MUL	alpha_i, c01, t4

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NC) || defined(TC) || defined(NR) || defined(TR)
	ADD	c03,  t1, c03
	ADD	c04,  t2, c04
	SUB	c03,  t3, c03
	ADD	c04,  t4, c04
#else
	ADD	c03,  t1, c03
	SUB	c04,  t2, c04
	ADD	c03,  t3, c03
	ADD	c04,  t4, c04
#endif

	ST	c03,  0 * SIZE(C1)
	ST	c04,  1 * SIZE(C1)
	.align 4

/*
 * Epilogue: reload $f2..$f9 from the first 64 bytes of the stack frame,
 * zero $0 (presumably the integer return value -- confirm against the
 * calling convention in use), release STACKSIZE bytes of stack and return.
 * The matching saves are outside this excerpt.
 */
$L999:
	ldt	$f2,   0($sp)
	ldt	$f3,   8($sp)
	ldt	$f4,  16($sp)
	ldt	$f5,  24($sp)
	ldt	$f6,  32($sp)
	ldt	$f7,  40($sp)
	ldt	$f8,  48($sp)
	ldt	$f9,  56($sp)
	clr	$0
	lda	$sp, STACKSIZE($sp)
	ret
	.ident	VERSION
	.end	CNAME
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -