📄 sgemv_n.s
字号:
/*
 * NOTE(review): this region was recovered from a listing in which every
 * instruction had been collapsed onto a handful of physical lines.  It is
 * laid out again as one directive / label / instruction per line.  The only
 * functional change is the early-out branch in the .L40 setup, which now
 * targets .L45 instead of .L145 (see the comment at that branch).
 *
 * Register roles visible in this region (names are preprocessor macros
 * defined outside this view):
 *   X, INCX        source vector pointer, post-incremented by INCX per load
 *   YY             base of the Y buffer; copied into YLD1 (load cursor) and
 *                  YST1 / YST2 (store cursors) at the top of each section
 *   A, LDA         current matrix position; AO1..AO4 are derived from it by
 *                  adding multiples of LDA
 *   MM             element count per pass: MM >> 3 full groups of 8, then
 *                  bits 2 / 1 / 0 select remainders of 4 / 2 / 1
 *   N              bits 2 / 1 / 0 select the .L20 / .L30 / .L40 sections
 *   f8..f11        X values pre-multiplied by ALPHA
 *   p8             when set, one leading element is processed on its own
 *                  before the main loop (set outside this view; presumably
 *                  an alignment fix-up for the paired loads - TODO confirm)
 */

/* ---- tail of the preceding section (its beginning is outside this view) ---- */
	(p14) FMA	f104 = f15, f78, f104
	(p14) FMA	f105 = f15, f79, f105
	(p15) FMA	f106 = f15, f87, f106
	;;
	(p13) STFD	[YST1] = f100, 1 * SIZE
	;;
	(p13) STFD	[YST1] = f101, 1 * SIZE
	;;
	(p13) STFD	[YST1] = f102, 1 * SIZE
	;;
	(p13) STFD	[YST1] = f103, 1 * SIZE
	;;
	(p14) STFD	[YST1] = f104, 1 * SIZE
	;;
	(p14) STFD	[YST1] = f105, 1 * SIZE
	;;
	(p15) STFD	[YST1] = f106, 1 * SIZE
	(p6) br.cond.dptk .L11
	;;
	.align 16

/*
 * .L20: section executed when bit 2 of N is set (otherwise skip to .L30).
 * Works on four matrix pointers AO1..AO4 (A, A+LDA, A+2*LDA, A+3*LDA) and
 * four ALPHA-scaled X values f8..f11, then advances A by 4*LDA.
 */
.L20:
	{ .mmi
	mov	YLD1 = YY
	mov	YST1 = YY
	tbit.z	p6, p0 = N, 2
	}
	;;
	{ .mib
	mov	AO1 = A
	mov	pr.rot= 0		/* clear the rotating predicates before the pipelined loop */
	(p6) br.cond.dpnt .L30
	}
	;;
	{ .mmi
	LDFD	f8 = [X], INCX
	(p8) LDFD	f106 = [YLD1], 1 * SIZE
	add	AO2 = LDA, A
	}
	;;
	{ .mmi
	LDFD	f9 = [X], INCX
	(p8) LDFD	f80 = [AO1], 1 * SIZE
	shladd	AO4 = LDA, 1, AO2	/* AO4 = AO2 + 2*LDA */
	}
	;;
	{ .mmi
	LDFD	f10 = [X], INCX
	(p8) LDFD	f81 = [AO2], 1 * SIZE
	shladd	AO3 = LDA, 1, A		/* AO3 = A + 2*LDA */
	}
	;;
	{ .mmi
	LDFD	f11 = [X], INCX
	(p8) LDFD	f82 = [AO3], 1 * SIZE
	}
	;;
	{ .mfi
	(p8) LDFD	f83 = [AO4], 1 * SIZE
	FMPY	f8 = ALPHA, f8
	adds	PREB = RPREFETCH * SIZE, YLD1
	}
	{ .mfi
	adds	RPRE1 = RPREFETCH * SIZE, AO1
	FMPY	f9 = ALPHA, f9
	adds	RPRE2 = (RPREFETCH + 8) * SIZE, AO2
	}
	;;
	FMPY	f10 = ALPHA, f10
	shladd	A = LDA, 2, A		/* A += 4*LDA for the next section */
	FMPY	f11 = ALPHA, f11
	;;
	{ .mfi
	adds	RPRE3 = RPREFETCH * SIZE, AO3
	(p8) FMA	f106 = f8, f80, f106
	mov	ar.ec= 2
	}
	;;
	adds	RPRE4 = (RPREFETCH + 8) * SIZE, AO4
	(p8) FMA	f106 = f9, f81, f106
	shr	I = MM, 3		/* I = number of full 8-element groups */
	;;
	{ .mmf
	cmp.eq	p6, p0 = 0, I		/* p6: no full group, skip the loop */
	cmp.eq	p16, p0 = r0, r0	/* p16 = 1: enable the first pipeline stage */
	(p8) FMA	f106 = f10, f82, f106
	}
	;;
	{ .mfi
	adds	I = -1, I
	(p8) FMA	f106 = f11, f83, f106
	tbit.nz	p13, p0 = MM, 2		/* p13: remainder of 4 elements */
	}
	;;
	{ .mib
	(p8) STFD	[YST1] = f106, 1 * SIZE
	mov	ar.lc = I
	(p6) br.cond.dpnt .L25
	}
	;;
	.align 16

/*
 * .L22: br.ctop loop, 8 Y elements per iteration.
 *   p16 stage: paired loads from AO1..AO4 and YLD1
 *   p17 stage: FMAs with f8..f11; the last FMA of each chain writes f16..f23
 *   p18 stage: stores f16..f23 through YST1 (each store is issued before the
 *              same iteration's FMA overwrites that register)
 * p14 / p15 alternate with bit 0 of I and gate the prefetches.
 */
.L22:
	{ .mfi
	(p17) LDFPD	f63, f64 = [AO4], 2 * SIZE
	(p17) FMA	f101 = f8, f33, f101
	(p16) tbit.nz.unc p14, p15 = I, 0
	}
	{ .mfi
	(p16) LDFPD	f100, f103 = [YLD1], 2 * SIZE
	(p17) FMA	f104 = f8, f34, f104
	}
	;;
	{ .mfi
	(p16) LDFPD	f32, f33 = [AO1], 2 * SIZE
	(p17) FMA	f107 = f8, f35, f107
	(p16) adds	I = -1, I
	}
	{ .mfi
	(p14) PREFETCH	[RPRE1], 16 * SIZE
	(p17) FMA	f110 = f8, f36, f110
	}
	;;
	{ .mfi
	(p16) LDFPD	f34, f35 = [AO1], 2 * SIZE
	(p17) FMA	f113 = f8, f37, f113
	}
	{ .mfi
	(p16) LDFPD	f106, f109 = [YLD1], 2 * SIZE
	(p17) FMA	f116 = f8, f38, f116
	}
	;;
	{ .mfi
	(p16) LDFPD	f36, f37 = [AO1], 2 * SIZE
	(p17) FMA	f119 = f8, f39, f119
	}
	{ .mfi
	(p16) LDFPD	f112, f115 = [YLD1], 2 * SIZE
	(p17) FMA	f122 = f8, f40, f122
	}
	;;
	{ .mfi
	(p16) LDFPD	f38, f39 = [AO1], 2 * SIZE
	(p17) FMA	f101 = f9, f41, f101
	}
	{ .mfi
	(p16) LDFPD	f118, f121 = [YLD1], 2 * SIZE
	(p17) FMA	f104 = f9, f42, f104
	}
	;;
	{ .mmf
	(p16) LDFPD	f40, f41 = [AO2], 2 * SIZE
	(p15) PREFETCH	[RPRE2], 16 * SIZE
	(p17) FMA	f107 = f9, f43, f107
	}
	{ .mfi
	(p18) STFD	[YST1] = f16, 1 * SIZE
	(p17) FMA	f110 = f9, f44, f110
	}
	;;
	{ .mfi
	(p16) LDFPD	f42, f43 = [AO2], 2 * SIZE
	(p17) FMA	f113 = f9, f45, f113
	}
	{ .mfi
	(p18) STFD	[YST1] = f17, 1 * SIZE
	(p17) FMA	f116 = f9, f46, f116
	}
	;;
	{ .mfi
	(p16) LDFPD	f44, f45 = [AO2], 2 * SIZE
	(p17) FMA	f119 = f9, f47, f119
	}
	{ .mfi
	(p18) STFD	[YST1] = f18, 1 * SIZE
	(p17) FMA	f122 = f9, f48, f122
	}
	;;
	{ .mfi
	(p16) LDFPD	f46, f47 = [AO2], 2 * SIZE
	(p17) FMA	f101 = f10, f49, f101
	}
	{ .mfi
	(p14) lfetch.excl.nt2 [PREB], 16 * SIZE
	(p17) FMA	f104 = f10, f50, f104
	}
	;;
	{ .mfi
	(p16) LDFPD	f48, f49 = [AO3], 2 * SIZE
	(p17) FMA	f107 = f10, f51, f107
	}
	{ .mfi
	(p14) PREFETCH	[RPRE3], 16 * SIZE
	(p17) FMA	f110 = f10, f52, f110
	}
	;;
	{ .mfi
	(p16) LDFPD	f50, f51 = [AO3], 2 * SIZE
	(p17) FMA	f113 = f10, f53, f113
	}
	{ .mfi
	(p18) STFD	[YST1] = f19, 1 * SIZE
	(p17) FMA	f116 = f10, f54, f116
	}
	;;
	{ .mfi
	(p16) LDFPD	f52, f53 = [AO3], 2 * SIZE
	(p17) FMA	f119 = f10, f55, f119
	}
	{ .mfi
	(p18) STFD	[YST1] = f20, 1 * SIZE
	(p17) FMA	f122 = f10, f56, f122
	}
	;;
	{ .mfi
	(p16) LDFPD	f54, f55 = [AO3], 2 * SIZE
	(p17) FMA	f16 = f11, f57, f101
	}
	{ .mfi
	(p15) PREFETCH	[RPRE4], 16 * SIZE
	(p17) FMA	f17 = f11, f58, f104
	}
	;;
	{ .mfi
	(p16) LDFPD	f56, f57 = [AO4], 2 * SIZE
	(p17) FMA	f18 = f11, f59, f107
	}
	{ .mfi
	(p18) STFD	[YST1] = f21, 1 * SIZE
	(p17) FMA	f19 = f11, f60, f110
	}
	;;
	{ .mfi
	(p16) LDFPD	f58, f59 = [AO4], 2 * SIZE
	(p17) FMA	f20 = f11, f61, f113
	}
	{ .mfi
	(p18) STFD	[YST1] = f22, 1 * SIZE
	(p17) FMA	f21 = f11, f62, f116
	}
	;;
	{ .mfi
	(p16) LDFPD	f60, f61 = [AO4], 2 * SIZE
	(p17) FMA	f22 = f11, f63, f119
	}
	{ .mfb
	(p18) STFD	[YST1] = f23, 1 * SIZE
	(p17) FMA	f23 = f11, f64, f122
	br.ctop.sptk.few .L22
	}
	;;
	.align 16

/*
 * .L25: tail of the .L20 section.  The (p18) stores flush the last group
 * produced by .L22 (all skipped when the loop was bypassed, since pr.rot was
 * cleared).  Remainders: p13 = 4 elements, p14 = 2 elements, p15 = 1 element
 * (bits 2 / 1 / 0 of MM).  Falls through into .L30.
 */
.L25:
	{ .mmi
	(p13) LDFPD	f32, f33 = [AO1], 2 * SIZE
	(p13) LDFPD	f100, f101 = [YLD1], 2 * SIZE
	tbit.nz	p14, p0 = MM, 1
	}
	{ .mmi
	(p18) STFD	[YST1] = f16, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) LDFPD	f48, f49 = [AO1], 2 * SIZE
	(p13) LDFPD	f102, f103 = [YLD1], 2 * SIZE
	tbit.nz	p15, p0 = MM, 0
	}
	{ .mmi
	(p18) STFD	[YST1] = f17, 1 * SIZE
	}
	;;
	{ .mmi
	(p14) LDFPD	f64, f65 = [AO1], 2 * SIZE
	(p14) LDFPD	f104, f105 = [YLD1], 2 * SIZE
	}
	{ .mmi
	(p18) STFD	[YST1] = f18, 1 * SIZE
	}
	;;
	{ .mmi
	(p15) LDFD	f80 = [AO1]
	(p15) LDFD	f106 = [YLD1], 1 * SIZE
	}
	{ .mmi
	(p18) STFD	[YST1] = f19, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) LDFPD	f34, f35 = [AO2], 2 * SIZE
	(p13) LDFPD	f36, f37 = [AO3], 2 * SIZE
	}
	{ .mmi
	(p18) STFD	[YST1] = f20, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) LDFPD	f50, f51 = [AO2], 2 * SIZE
	(p13) LDFPD	f52, f53 = [AO3], 2 * SIZE
	}
	{ .mmi
	(p18) STFD	[YST1] = f21, 1 * SIZE
	}
	;;
	{ .mmi
	(p14) LDFPD	f66, f67 = [AO2], 2 * SIZE
	(p14) LDFPD	f68, f69 = [AO3], 2 * SIZE
	}
	{ .mmi
	(p18) STFD	[YST1] = f22, 1 * SIZE
	}
	;;
	{ .mmf
	(p15) LDFD	f81 = [AO2]
	(p15) LDFD	f82 = [AO3]
	(p13) FMA	f100 = f8, f32, f100
	}
	{ .mfi
	(p18) STFD	[YST1] = f23, 1 * SIZE
	(p13) FMA	f101 = f8, f33, f101
	}
	;;
	;;
	{ .mfi
	(p13) LDFPD	f38, f39 = [AO4], 2 * SIZE
	(p13) FMA	f102 = f8, f48, f102
	}
	{ .mfi
	(p13) FMA	f103 = f8, f49, f103
	}
	;;
	{ .mfi
	(p13) LDFPD	f54, f55 = [AO4], 2 * SIZE
	(p14) FMA	f104 = f8, f64, f104
	}
	{ .mfi
	(p14) FMA	f105 = f8, f65, f105
	}
	;;
	{ .mfi
	(p14) LDFPD	f70, f71 = [AO4], 2 * SIZE
	(p15) FMA	f106 = f8, f80, f106
	}
	{ .mfi
	(p13) FMA	f100 = f9, f34, f100
	}
	;;
	{ .mfi
	(p15) LDFD	f83 = [AO4]
	(p13) FMA	f101 = f9, f35, f101
	}
	{ .mfi
	(p13) FMA	f102 = f9, f50, f102
	}
	;;
	(p13) FMA	f103 = f9, f51, f103
	(p14) FMA	f104 = f9, f66, f104
	(p14) FMA	f105 = f9, f67, f105
	(p15) FMA	f106 = f9, f81, f106
	;;
	(p13) FMA	f100 = f10, f36, f100
	(p13) FMA	f101 = f10, f37, f101
	(p13) FMA	f102 = f10, f52, f102
	(p13) FMA	f103 = f10, f53, f103
	(p14) FMA	f104 = f10, f68, f104
	(p14) FMA	f105 = f10, f69, f105
	(p15) FMA	f106 = f10, f82, f106
	;;
	(p13) FMA	f100 = f11, f38, f100
	(p13) FMA	f101 = f11, f39, f101
	;;
	(p13) FMA	f102 = f11, f54, f102
	(p13) STFD	[YST1] = f100, 1 * SIZE
	(p13) FMA	f103 = f11, f55, f103
	;;
	(p13) STFD	[YST1] = f101, 1 * SIZE
	(p14) FMA	f104 = f11, f70, f104
	;;
	(p13) STFD	[YST1] = f102, 1 * SIZE
	(p14) FMA	f105 = f11, f71, f105
	;;
	(p13) STFD	[YST1] = f103, 1 * SIZE
	(p15) FMA	f106 = f11, f83, f106
	;;
	(p14) STFD	[YST1] = f104, 1 * SIZE
	;;
	(p14) STFD	[YST1] = f105, 1 * SIZE
	;;
	(p15) STFD	[YST1] = f106, 1 * SIZE
	;;
	.align 16

/*
 * .L30: section executed when bit 1 of N is set (otherwise skip to .L40).
 * Same structure as .L20 but with two matrix pointers (AO1 = A,
 * AO2 = A + LDA) and two scaled X values f8 / f9; A advances by 2*LDA.
 */
.L30:
	{ .mmi
	mov	YLD1 = YY
	mov	YST1 = YY
	tbit.z	p6, p0 = N, 1
	}
	;;
	{ .mib
	mov	AO1 = A
	mov	pr.rot= 0
	(p6) br.cond.dpnt .L40
	}
	;;
	{ .mmi
	LDFD	f8 = [X], INCX
	(p8) LDFD	f106 = [YLD1], 1 * SIZE
	add	AO2 = LDA, A
	}
	;;
	{ .mmi
	LDFD	f9 = [X], INCX
	(p8) LDFD	f80 = [AO1], 1 * SIZE
	shladd	A = LDA, 1, A		/* A += 2*LDA for the next section */
	}
	;;
	adds	PREB = RPREFETCH * SIZE, YLD1
	FMPY	f8 = ALPHA, f8
	mov	ar.ec= 2
	adds	RPRE1 = RPREFETCH * SIZE, AO1
	FMPY	f9 = ALPHA, f9
	shr	I = MM, 3
	;;
	(p8) LDFD	f81 = [AO2], 1 * SIZE
	cmp.eq	p6, p0 = 0, I
	;;
	(p8) FMA	f106 = f8, f80, f106
	adds	RPRE2 = (RPREFETCH + 8) * SIZE, AO2
	tbit.nz	p13, p0 = MM, 2
	;;
	(p8) FMA	f106 = f9, f81, f106
	cmp.eq	p16, p0 = r0, r0
	adds	I = -1, I
	;;
	{ .mib
	(p8) STFD	[YST1] = f106, 1 * SIZE
	mov	ar.lc = I
	(p6) br.cond.dpnt .L35
	}
	;;
	.align 16

/*
 * .L32: br.ctop loop for the two-pointer case, 8 Y elements per iteration.
 * Stages as in .L22: p16 loads, p17 FMAs (results land in f16..f23),
 * p18 stores.
 */
.L32:
	{ .mfi
	(p17) LDFPD	f47, f48 = [AO2], 2 * SIZE
	(p17) FMA	f101 = f8, f33, f101
	(p16) tbit.nz.unc p14, p15 = I, 0
	}
	{ .mmf
	(p16) LDFPD	f100, f103 = [YLD1], 2 * SIZE
	(p18) STFD	[YST1] = f16, 1 * SIZE
	(p17) FMA	f104 = f8, f34, f104
	}
	;;
	{ .mfi
	(p16) LDFPD	f32, f33 = [AO1], 2 * SIZE
	(p17) FMA	f107 = f8, f35, f107
	adds	I = -1, I		/* unpredicated here; I only feeds the p16-gated prefetch toggle above */
	}
	{ .mmf
	(p14) PREFETCH	[RPRE1], 16 * SIZE
	(p18) STFD	[YST1] = f17, 1 * SIZE
	(p17) FMA	f110 = f8, f36, f110
	}
	;;
	{ .mfi
	(p16) LDFPD	f34, f35 = [AO1], 2 * SIZE
	(p17) FMA	f113 = f8, f37, f113
	}
	{ .mmf
	(p16) LDFPD	f106, f109 = [YLD1], 2 * SIZE
	(p18) STFD	[YST1] = f18, 1 * SIZE
	(p17) FMA	f116 = f8, f38, f116
	}
	;;
	{ .mfi
	(p16) LDFPD	f36, f37 = [AO1], 2 * SIZE
	(p17) FMA	f119 = f8, f39, f119
	}
	{ .mmf
	(p16) LDFPD	f112, f115 = [YLD1], 2 * SIZE
	(p18) STFD	[YST1] = f19, 1 * SIZE
	(p17) FMA	f122 = f8, f40, f122
	}
	;;
	{ .mfi
	(p16) LDFPD	f38, f39 = [AO1], 2 * SIZE
	(p17) FMA	f16 = f9, f41, f101
	}
	{ .mmf
	(p16) LDFPD	f118, f121 = [YLD1], 2 * SIZE
	(p18) STFD	[YST1] = f20, 1 * SIZE
	(p17) FMA	f17 = f9, f42, f104
	}
	;;
	{ .mfi
	(p16) LDFPD	f40, f41 = [AO2], 2 * SIZE
	(p17) FMA	f18 = f9, f43, f107
	}
	{ .mmf
	(p15) PREFETCH	[RPRE2], 16 * SIZE
	(p18) STFD	[YST1] = f21, 1 * SIZE
	(p17) FMA	f19 = f9, f44, f110
	}
	;;
	{ .mfi
	(p16) LDFPD	f42, f43 = [AO2], 2 * SIZE
	(p17) FMA	f20 = f9, f45, f113
	}
	{ .mmf
	(p14) PREFETCH	[PREB], 16 * SIZE
	(p18) STFD	[YST1] = f22, 1 * SIZE
	(p17) FMA	f21 = f9, f46, f116
	}
	;;
	{ .mfi
	(p16) LDFPD	f44, f45 = [AO2], 2 * SIZE
	(p17) FMA	f22 = f9, f47, f119
	}
	{ .mfb
	(p18) STFD	[YST1] = f23, 1 * SIZE
	(p17) FMA	f23 = f9, f48, f122
	br.ctop.sptk.few .L32
	}
	;;
	.align 16

/*
 * .L35: tail of the .L30 section: flush the last pipelined group (p18),
 * then handle remainders of 4 / 2 / 1 elements under p13 / p14 / p15.
 * Falls through into .L40.
 */
.L35:
	{ .mmi
	(p13) LDFPD	f32, f33 = [AO1], 2 * SIZE
	(p13) LDFPD	f100, f101 = [YLD1], 2 * SIZE
	tbit.nz	p14, p0 = MM, 1
	}
	{ .mmi
	(p18) STFD	[YST1] = f16, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) LDFPD	f48, f49 = [AO1], 2 * SIZE
	(p13) LDFPD	f102, f103 = [YLD1], 2 * SIZE
	tbit.nz	p15, p0 = MM, 0
	}
	{ .mmi
	(p18) STFD	[YST1] = f17, 1 * SIZE
	}
	;;
	{ .mmi
	(p14) LDFPD	f64, f65 = [AO1], 2 * SIZE
	(p14) LDFPD	f104, f105 = [YLD1], 2 * SIZE
	}
	{ .mmi
	(p18) STFD	[YST1] = f18, 1 * SIZE
	}
	;;
	{ .mmi
	(p15) LDFD	f80 = [AO1]
	(p15) LDFD	f106 = [YLD1], 1 * SIZE
	}
	{ .mmi
	(p18) STFD	[YST1] = f19, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) LDFPD	f34, f35 = [AO2], 2 * SIZE
	(p18) STFD	[YST1] = f20, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) LDFPD	f50, f51 = [AO2], 2 * SIZE
	(p18) STFD	[YST1] = f21, 1 * SIZE
	}
	;;
	{ .mmi
	(p14) LDFPD	f66, f67 = [AO2], 2 * SIZE
	(p18) STFD	[YST1] = f22, 1 * SIZE
	}
	;;
	{ .mmi
	(p15) LDFD	f81 = [AO2]
	(p18) STFD	[YST1] = f23, 1 * SIZE
	}
	;;
	(p13) FMA	f100 = f8, f32, f100
	(p13) FMA	f101 = f8, f33, f101
	(p13) FMA	f102 = f8, f48, f102
	(p13) FMA	f103 = f8, f49, f103
	(p14) FMA	f104 = f8, f64, f104
	(p14) FMA	f105 = f8, f65, f105
	(p15) FMA	f106 = f8, f80, f106
	;;
	(p13) FMA	f100 = f9, f34, f100
	(p13) FMA	f101 = f9, f35, f101
	(p13) FMA	f102 = f9, f50, f102
	(p13) FMA	f103 = f9, f51, f103
	(p14) FMA	f104 = f9, f66, f104
	(p14) FMA	f105 = f9, f67, f105
	(p15) FMA	f106 = f9, f81, f106
	;;
	(p13) STFD	[YST1] = f100, 1 * SIZE
	;;
	(p13) STFD	[YST1] = f101, 1 * SIZE
	;;
	(p13) STFD	[YST1] = f102, 1 * SIZE
	;;
	(p13) STFD	[YST1] = f103, 1 * SIZE
	;;
	(p14) STFD	[YST1] = f104, 1 * SIZE
	;;
	(p14) STFD	[YST1] = f105, 1 * SIZE
	;;
	(p15) STFD	[YST1] = f106, 1 * SIZE
	;;
	.align 16

/*
 * .L40: section executed when bit 0 of N is set (otherwise go to .L990,
 * which is outside this view).  One matrix pointer (AO1 = A) and one scaled
 * X value f8.  Uses two store cursors: YST1 for elements 0..3 of each group
 * of 8 and YST2 = YST1 + 4 elements for elements 4..7.
 */
.L40:
	{ .mmi
	mov	YLD1 = YY
	mov	YST1 = YY
	tbit.z	p6, p0 = N, 0
	}
	;;
	{ .mib
	mov	AO1 = A
	mov	pr.rot= 0
	(p6) br.cond.dpnt .L990
	}
	;;
	{ .mmi
	LDFD	f8 = [X], INCX
	(p8) LDFD	f106 = [YLD1], 1 * SIZE
	adds	RPRE1 = RPREFETCH * SIZE, AO1
	}
	;;
	{ .mii
	(p8) LDFD	f80 = [AO1], 1 * SIZE
	adds	PREB = RPREFETCH * SIZE, YLD1
	}
	;;
	FMPY	f8 = ALPHA, f8
	shr	I = MM, 3
	;;
	(p8) FMA	f106 = f8, f80, f106
	mov	ar.ec= 3
	;;
	{ .mmi
	cmp.eq	p6, p0 = 0, I
	cmp.eq	p16, p0 = r0, r0
	tbit.nz	p14, p15 = r0, 0	/* bit 0 of r0 is 0: p14 = 0, p15 = 1 */
	}
	;;
	{ .mmi
	adds	YST2 = 4 * SIZE, YST1
	adds	I = -1, I
	tbit.nz	p13, p0 = MM, 2
	}
	;;
	{ .mmi
	(p8) STFD	[YST1] = f106, 1 * SIZE
	(p8) adds	YST2 = 1 * SIZE, YST2	/* keep YST2 four elements ahead of YST1 */
	}
	{ .mib
	mov	ar.lc = I
	/*
	 * FIX: was ".L145".  With no full 8-element group the loop must be
	 * skipped to this section's own tail, .L45, exactly as .L20 skips to
	 * .L25 and .L30 skips to .L35.  p19 is clear here (pr.rot = 0 above),
	 * so .L45 performs only the remainder handling.
	 */
	(p6) br.cond.dpnt .L45
	}
	;;
	.align 16

/*
 * .L42: br.ctop loop for the single-pointer case, 8 Y elements per
 * iteration.  p16 stage loads, p18 stage computes into f16..f23, p19 stage
 * stores (YST1 gets f16..f19, YST2 gets f20..f23; the fourth store of each
 * cursor steps 5 elements to reach the next group).
 */
.L42:
	{ .mmf
	(p19) STFD	[YST1] = f16, 1 * SIZE
	(p19) STFD	[YST2] = f20, 1 * SIZE
	(p18) FMA	f16 = f8, f34, f102
	}
	{ .mmf
	(p16) LDFPD	f32, f35 = [AO1], 2 * SIZE
	(p16) LDFPD	f100, f103 = [YLD1], 2 * SIZE
	(p18) FMA	f20 = f8, f46, f114
	}
	;;
	{ .mmf
	(p19) STFD	[YST1] = f17, 1 * SIZE
	(p19) STFD	[YST2] = f21, 1 * SIZE
	(p18) FMA	f17 = f8, f37, f105
	}
	{ .mmf
	(p16) LDFPD	f38, f41 = [AO1], 2 * SIZE
	(p16) LDFPD	f106, f109 = [YLD1], 2 * SIZE
	(p18) FMA	f21 = f8, f49, f117
	}
	;;
	{ .mmf
	(p19) STFD	[YST1] = f18, 1 * SIZE
	(p19) STFD	[YST2] = f22, 1 * SIZE
	(p18) FMA	f18 = f8, f40, f108
	}
	{ .mmf
	(p16) LDFPD	f44, f47 = [AO1], 2 * SIZE
	(p16) LDFPD	f112, f115 = [YLD1], 2 * SIZE
	(p18) FMA	f22 = f8, f52, f120
	}
	;;
	{ .mmf
	(p19) STFD	[YST1] = f19, 5 * SIZE
	(p19) STFD	[YST2] = f23, 5 * SIZE
	(p18) FMA	f19 = f8, f43, f111
	}
	{ .mmf
	(p16) LDFPD	f50, f53 = [AO1], 2 * SIZE
	(p16) LDFPD	f118, f121 = [YLD1], 2 * SIZE
	(p18) FMA	f23 = f8, f55, f123
	}
	;;
	{ .mmi
	(p14) PREFETCH	[RPRE1], 16 * SIZE
	(p14) PREFETCH	[PREB], 16 * SIZE
	(p16) tbit.nz.unc p14, p15 = I, 0
	}
	{ .mib
	nop	__LINE__
	(p16) adds	I = -1, I
	br.ctop.sptk.few .L42
	}
	;;
	.align 16

/*
 * .L45: tail of the .L40 section: flush the last pipelined group (p19),
 * handle remainders of 4 / 2 / 1 elements under p13 / p14 / p15, then
 * branch to .L990 (outside this view).
 */
.L45:
	{ .mmi
	(p19) STFD	[YST1] = f16, 1 * SIZE
	(p19) STFD	[YST2] = f20, 1 * SIZE
	tbit.nz	p14, p0 = MM, 1
	}
	{ .mmi
	(p13) LDFPD	f32, f33 = [AO1], 2 * SIZE
	(p13) LDFPD	f100, f101 = [YLD1], 2 * SIZE
	}
	;;
	{ .mmi
	(p19) STFD	[YST1] = f17, 1 * SIZE
	(p19) STFD	[YST2] = f21, 1 * SIZE
	tbit.nz	p15, p0 = MM, 0
	}
	{ .mmi
	(p13) LDFPD	f48, f49 = [AO1], 2 * SIZE
	(p13) LDFPD	f102, f103 = [YLD1], 2 * SIZE
	}
	;;
	{ .mmi
	(p19) STFD	[YST1] = f18, 1 * SIZE
	(p19) STFD	[YST2] = f22, 1 * SIZE
	}
	{ .mmi
	(p14) LDFPD	f64, f65 = [AO1], 2 * SIZE
	(p14) LDFPD	f104, f105 = [YLD1], 2 * SIZE
	}
	;;
	{ .mmi
	(p19) STFD	[YST1] = f19, 5 * SIZE
	(p19) STFD	[YST2] = f23, 5 * SIZE
	}
	{ .mmi
	(p15) LDFD	f80 = [AO1]
	(p15) LDFD	f106 = [YLD1], 1 * SIZE
	}
	;;
	(p13) FMA	f100 = f8, f32, f100
	(p13) FMA	f101 = f8, f33, f101
	(p13) FMA	f102 = f8, f48, f102
	(p13) FMA	f103 = f8, f49, f103
	;;
	(p13) STFD	[YST1] = f100, 1 * SIZE
	(p14) FMA	f104 = f8, f64, f104
	;;
	(p13) STFD	[YST1] = f101, 1 * SIZE
	(p14) FMA	f105 = f8, f65, f105
	;;
	(p13) STFD	[YST1] = f102, 1 * SIZE
	(p15) FMA	f106 = f8, f80, f106
	;;
	(p13) STFD	[YST1] = f103, 1 * SIZE
	;;
	(p14) STFD	[YST1] = f104, 1 * SIZE
	;;
	(p14) STFD	[YST1] = f105, 1 * SIZE
	;;
	(p15) STFD	[YST1] = f106, 1 * SIZE
	br	.L990
	;;
	.align 16

/*
 * .L100: start of a separate path (how it is reached is not visible here).
 * J = N >> 3; when J is zero, skip to .L120 (outside this view).
 */
.L100:
	shr	J = N, 3
	;;
	cmp.eq	p6, p0 = r0, J
	(p6) br.cond.dpnt .L120
	;;
	.align 16

/* .L111: beginning only; the rest of this section is outside this view. */
.L111:
	mov	YLD1 = YY
	mov	YST1 = YY
	;;
	LDFD	f8 = [X], INCX
	;;
	LDFD	f9 = [X], INCX
	;;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -