📄 zgemv_t.s
字号:
STFD [CST2] = f37 FMA f41 = ALPHA_R, f17, f41 } { .mmf add CST1 = CST1, INCYM1 add CST2 = CST2, INCYM1 FMA f45 = ALPHA_R, f21, f45 } ;; { .mmf STFD [CST1] = f34, SIZE STFD [CST2] = f38, SIZE FNMA f42 = ALPHA_I, f19, f42 } { .mmf nop __LINE__ nop __LINE__ FNMA f46 = ALPHA_I, f23, f46 } ;; { .mmf STFD [CST1] = f35 STFD [CST2] = f39 FMA f43 = ALPHA_R, f19, f43 } { .mmf add CST1 = CST1, INCY3M1 add CST2 = CST2, INCY3M1 FMA f47 = ALPHA_R, f23, f47 } ;; { .mmi STFD [CST1] = f40, SIZE STFD [CST2] = f44, SIZE adds J = -1, J } ;; { .mmi STFD [CST1] = f41 STFD [CST2] = f45 add CST1 = CST1, INCYM1 } { .mmi nop __LINE__ nop __LINE__ add CST2 = CST2, INCYM1 } ;; { .mmi STFD [CST1] = f42, SIZE STFD [CST2] = f46, SIZE cmp.lt p6, p0 = 0, J } ;; { .mmi STFD [CST1] = f43 STFD [CST2] = f47 add CST1 = CST1, INCY3M1 } { .mmb add CST2 = CST2, INCY3M1 (p6) br.cond.dptk .L11 } ;; .align 16.L20: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi add AO2 = LDA, A mov f10 = f0 tbit.z p6, p0 = N, 2 } ;; { .mmf shladd AO3 = LDA, 1, A shladd AO4 = LDA, 1, AO2 mov f12 = f0 } { .mfb mov BO = BUFFER mov f14 = f0 (p6) br.cond.dpnt .L30 } ;; { .mfi adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 mov f9 = f0 mov ar.ec= 5 } { .mmf adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 adds I = -1, MIN_M mov f11 = f0 } ;; { .mmf adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3 adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4 mov f13 = f0 } { .mmf cmp.eq p16, p0 = r0, r0 shladd A = LDA, 2, A mov f15 = f0 } ;; { .mmi lfetch.excl.nt1 [WPRE] adds PREB = RPREFETCH * SIZE, BO mov ar.lc = I } { .mmi adds WPRE = 16 * SIZE, CLD1 cmp.eq p12, p0 = r0, r0 mov I = 0 } ;; .align 16.L26: { .mmf (p12) PREFETCH [RPRE1], 16 * SIZE (p16) LDFPD f32, f37 = [AO1], 2 * SIZE (p20) ADD1 f8 = f116, f36, f8 } { .mmf (p16) cmp.eq.unc p13, p0 = 2, I nop __LINE__ (p20) ADD2 f9 = f121, f36, f9 } ;; { .mmf (p12) PREFETCH [PREB], 16 * SIZE (p16) LDFPD f112, f117 = [BO], 2 * SIZE (p20) ADD1 f10 = f116, f46, f10 } { .mmf (p16) cmp.eq.unc p14, p0 = 4, I (p16) cmp.eq.unc p15, p0 = 6, I (p20) ADD2 f11 = f121, f46, f11 } ;; { .mmf (p16) LDFPD f42, f47 = [AO2], 2 * SIZE nop __LINE__ (p20) ADD1 f12 = f116, f56, f12 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f13 = f121, f56, f13 } ;; { .mmf (p13) PREFETCH [RPRE2], 16 * SIZE nop __LINE__ (p20) ADD1 f14 = f116, f66, f14 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f15 = f121, f66, f15 } ;; { .mmf (p16) LDFPD f52, f57 = [AO3], 2 * SIZE nop __LINE__ (p20) ADD3 f8 = f121, f41, f8 } { .mmf (p16) adds I = 1, I nop __LINE__ (p20) ADD4 f9 = f116, f41, f9 } ;; { .mmf (p14) PREFETCH [RPRE3], 16 * SIZE nop __LINE__ (p20) ADD3 f10 = f121, f51, f10 } { .mmf (p16) cmp.eq.unc p15, p0 = 8, I nop __LINE__ (p20) ADD4 f11 = f116, f51, f11 } ;; { .mmf (p16) LDFPD f62, f67 = [AO4], 2 * SIZE nop __LINE__ (p20) ADD3 f12 = f121, f61, f12 } { .mmf (p15) mov I = 0 nop __LINE__ (p20) ADD4 f13 = f116, f61, f13 } ;; { .mmf (p15) PREFETCH [RPRE4], 16 * SIZE nop __LINE__ (p20) ADD3 f14 = f121, f71, f14 } { .mfb (p16) cmp.eq.unc p12, p0 = 0, I (p20) ADD4 f15 = f116, f71, f15 br.ctop.sptk.few .L26 } ;;.L28: LDFD f32 = [CLD1], SIZE LDFD f36 = [CLD2], SIZE shladd CST2 = INCY, 1, CST1 ;; LDFD f33 = [CLD1], INCYM1 LDFD f37 = [CLD2], INCYM1 ;; LDFD f34 = [CLD1], SIZE LDFD f38 = [CLD2], SIZE ;; LDFD f35 = [CLD1], INCY3M1 LDFD f39 = [CLD2], INCY3M1 ;; FMA f32 = ALPHA_R, f8, f32 FMA f36 = ALPHA_R, f12, f36 FMA f33 = ALPHA_I, f8, f33 FMA f37 = ALPHA_I, f12, f37 FMA f34 = ALPHA_R, f10, f34 FMA f38 = ALPHA_R, f14, f38 FMA f35 = ALPHA_I, f10, f35 FMA f39 = ALPHA_I, f14, f39 ;; FNMA f32 = ALPHA_I, f9, f32 FNMA f36 = ALPHA_I, f13, f36 FMA f33 = ALPHA_R, f9, f33 FMA f37 = ALPHA_R, f13, f37 FNMA f34 = ALPHA_I, f11, f34 FNMA f38 = ALPHA_I, f15, f38 FMA f35 = ALPHA_R, f11, f35 FMA f39 = ALPHA_R, f15, f39 ;; STFD [CST1] = f32, SIZE STFD [CST2] = f36, SIZE ;; STFD [CST1] = f33 STFD [CST2] = f37 add CST1 = CST1, INCYM1 add CST2 = CST2, INCYM1 ;; STFD [CST1] = f34, SIZE STFD [CST2] = f38, SIZE ;; STFD [CST1] = f35 STFD [CST2] = f39 add CST1 = CST1, INCY3M1 add CST2 = CST2, INCY3M1 ;; .align 16.L30: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi add AO2 = LDA, A mov f10 = f0 tbit.z p6, p0 = N, 1 } ;; { .mmf adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 mov f12 = f0 } { .mfb adds I = -1, MIN_M mov f14 = f0 (p6) br.cond.dpnt .L40 } ;; { .mfi mov BO = BUFFER mov f9 = f0 mov ar.ec= 5 } { .mmf cmp.eq p16, p0 = r0, r0 shladd A = LDA, 1, A mov f11 = f0 } ;; { .mfi adds WPRE = 16 * SIZE, CLD1 mov f13 = f0 mov ar.lc = I } { .mmf adds PREB = RPREFETCH * SIZE, BO nop __LINE__ mov f15 = f0 } ;; { .mmi lfetch.excl.nt1 [WPRE] cmp.eq p12, p0 = r0, r0 mov I = 0 } ;; .align 16.L36: { .mmf (p12) PREFETCH [RPRE1], 16 * SIZE (p16) LDFPD f32, f37 = [AO1], 2 * SIZE (p20) ADD1 f8 = f116, f36, f8 } { .mmf (p16) cmp.eq.unc p13, p0 = 4, I (p16) adds I = 1, I (p20) ADD2 f9 = f121, f36, f9 } ;; { .mmf (p12) PREFETCH [PREB], 16 * SIZE (p16) LDFPD f112, f117 = [BO], 2 * SIZE (p20) ADD1 f10 = f116, f46, f10 } { .mmf (p16) cmp.eq.unc p12, p0 = 8, I (p20) ADD2 f11 = f121, f46, f11 } ;; { .mmf (p13) PREFETCH [RPRE2], 16 * SIZE (p16) LDFPD f42, f47 = [AO2], 2 * SIZE (p20) ADD3 f12 = f121, f41, f12 } { .mmf (p12) mov I = 0 (p20) ADD4 f13 = f116, f41, f13 } ;; { .mmf (p20) ADD3 f14 = f121, f51, f14 } { .mfb nop __LINE__ (p20) ADD4 f15 = f116, f51, f15 br.ctop.sptk.few .L36 } ;;.L38: LDFD f32 = [CLD1], SIZE FADD f8 = f8, f12 shladd CST2 = INCY, 1, CST1 ;; LDFD f33 = [CLD1], INCYM1 FADD f10 = f10, f14 ;; LDFD f34 = [CLD1], SIZE FADD f9 = f9, f13 ;; LDFD f35 = [CLD1], INCYM1 FADD f11 = f11, f15 ;; FMA f32 = ALPHA_R, f8, f32 FMA f33 = ALPHA_I, f8, f33 FMA f34 = ALPHA_R, f10, f34 FMA f35 = ALPHA_I, f10, f35 ;; FNMA f32 = ALPHA_I, f9, f32 FMA f33 = ALPHA_R, f9, f33 FNMA f34 = ALPHA_I, f11, f34 FMA f35 = ALPHA_R, f11, f35 ;; STFD [CST1] = f32, SIZE ;; STFD [CST1] = f33 add CST1 = CST1, INCYM1 ;; STFD [CST1] = f34, SIZE ;; STFD [CST1] = f35 add CST1 = CST1, INCYM1 ;; .align 16.L40: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi mov f9 = f0 tbit.z p6, p0 = N, 0 } ;; { .mfi adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 mov f10 = f0 mov ar.ec= 5 } { .mfb adds I = -1, MIN_M mov f11 = f0 (p6) br.cond.dpnt .L99 } ;; { .mmi cmp.eq p16, p0 = r0, r0 add A = LDA, A mov ar.lc = I } { .mmi adds WPRE = 16 * SIZE, CLD1 adds PREB = RPREFETCH * SIZE, BO mov BO = BUFFER } ;; { .mmi lfetch.excl.nt1 [WPRE] cmp.eq p12, p0 = r0, r0 mov I = 0 } ;; .align 16.L46: { .mmf (p12) PREFETCH [RPRE1], 16 * SIZE (p16) LDFPD f32, f37 = [AO1], 2 * SIZE (p20) ADD1 f8 = f116, f36, f8 } { .mmf (p16) cmp.eq.unc p12, p0 = 7, I (p16) adds I = 1, I (p20) ADD2 f9 = f121, f36, f9 } ;; { .mmf (p16) LDFPD f112, f117 = [BO], 2 * SIZE (p20) ADD3 f10 = f121, f41, f10 } { .mfb (p12) mov I = 0 (p20) ADD4 f11 = f116, f41, f11 br.ctop.sptk.few .L46 } ;;.L48: LDFD f32 = [CLD1], SIZE FADD f8 = f8, f10 shladd CST2 = INCY, 1, CST1 ;; LDFD f33 = [CLD1], INCYM1 FADD f9 = f9, f11 ;; FMA f32 = ALPHA_R, f8, f32 FMA f33 = ALPHA_I, f8, f33 ;; FNMA f32 = ALPHA_I, f9, f32 FMA f33 = ALPHA_R, f9, f33 ;; STFD [CST1] = f32, SIZE ;; STFD [CST1] = f33 add CST1 = CST1, INCYM1 ;; .align 16.L99: adds IS = P, IS shladd A = LDAP, ZBASE_SHIFT, A ;; cmp.gt p6, p0 = M, IS (p6) br.cond.dptk .LIs_loop br .L999 .align 16 ;;.L100: { .mmi mov CLD1 = Y shladd CLD2 = INCY, 1, Y shr J = N, 3 } ;; { .mmb mov CST1 = Y cmp.eq p6, p0 = r0, J (p6) br.cond.dpnt .L120 } ;; .align 16.L111: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi add AO2 = LDA, A mov f10 = f0 mov BO = BUFFER } ;; { .mmf shladd AO3 = LDA, 1, A shladd AO4 = LDA, 1, AO2 mov f12 = f0 } { .mmf adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 mov f14 = f0 } ;; { .mmf shladd AO5 = LDA, 1, AO3 shladd AO6 = LDA, 1, AO4 mov f16 = f0 } { .mmf adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3 adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4 mov f18 = f0 } ;; { .mmf shladd AO7 = LDA, 1, AO5 shladd AO8 = LDA, 1, AO6 mov f20 = f0 } { .mmf adds RPRE5 = (RPREFETCH + 8) * SIZE, AO5 adds RPRE6 = (RPREFETCH + 10) * SIZE, AO6 mov f22 = f0 } ;; { .mfi shladd A = LDA, 3, A mov f9 = f0 mov ar.ec= 5 } { .mmf adds RPRE7 = (RPREFETCH + 12) * SIZE, AO7 adds RPRE8 = (RPREFETCH + 14) * SIZE, AO8 mov f11 = f0 } ;; { .mmf adds WPRE = 16 * SIZE, CLD1 adds PREB = RPREFETCH * SIZE, BO mov f13 = f0 } { .mmf adds I = -1, MIN_M cmp.eq p16, p0 = r0, r0 mov f15 = f0 } ;; { .mfi cmp.eq p12, p0 = r0, r0 mov f17 = f0 mov ar.lc = I } { .mmf nop __LINE__ nop __LINE__ mov f19 = f0 } ;; { .mmf lfetch.excl.nt1 [WPRE] nop __LINE__ mov f21 = f0 } { .mmf mov I = 0 nop __LINE__ mov f23 = f0 } ;; .align 16.L116: { .mmf (p12) PREFETCH [RPRE1], 16 * SIZE (p16) LDFD f32 = [AO1], 1 * SIZE (p20) ADD1 f8 = f116, f36, f8 } { .mmf (p16) cmp.eq.unc p13, p0 = 1, I (p16) cmp.eq.unc p14, p0 = 2, I (p20) ADD2 f9 = f121, f36, f9 } ;; { .mmf (p13) PREFETCH [PREB], 16 * SIZE (p16) LDFPD f112, f117 = [BO], 2 * SIZE (p20) ADD1 f10 = f116, f46, f10 } { .mmf (p16) LDFD f37 = [AO1], 1 * SIZE (p16) cmp.eq.unc p15, p0 = 3, I (p20) ADD2 f11 = f121, f46, f11 } ;; { .mmf (p13) PREFETCH [RPRE2], 16 * SIZE (p16) LDFD f42 = [AO2], 1 * SIZE (p20) ADD1 f12 = f116, f56, f12 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f13 = f121, f56, f13 } ;; { .mmf (p16) LDFD f47 = [AO2], 1 * SIZE nop __LINE__ (p20) ADD1 f14 = f116, f66, f14 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f15 = f121, f66, f15 } ;; { .mmf (p14) PREFETCH [RPRE3], 16 * SIZE (p16) LDFD f52 = [AO3], 1 * SIZE (p20) ADD3 f8 = f121, f41, f8 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD4 f9 = f116, f41, f9 } ;; { .mmf (p16) LDFD f57 = [AO3], 1 * SIZE nop __LINE__ (p20) ADD3 f10 = f121, f51, f10 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD4 f11 = f116, f51, f11 } ;; { .mmf (p15) PREFETCH [RPRE4], 16 * SIZE (p16) LDFD f62 = [AO4], 1 * SIZE (p20) ADD3 f12 = f121, f61, f12 } { .mmf (p16) cmp.eq.unc p12, p0 = 4, I (p16) cmp.eq.unc p13, p0 = 5, I (p20) ADD4 f13 = f116, f61, f13 } ;; { .mmf (p16) LDFD f67 = [AO4], 1 * SIZE nop __LINE__ (p20) ADD3 f14 = f121, f71, f14 }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -