📄 s_tanh.s
字号:
// ---------------------------------------------------------------------------
// Tail of the main polynomial path of tanh (the path begins earlier in the
// function; only the instructions below are reproduced here).
//
// With t = fArgAbsNorm, and assuming fTSqr = t^2, fTDeg3 = t^3 and
// fTQuadr = t^4 were set up before this point (TODO confirm against the
// preceding code), the sequence evaluates
//     fA4  = A7*t^6 + A6*t^5 + A5*t^4 + A4*t^3 + A3*t^2 + A2*t + A1
//     fRes = (A19*t^11 + ... + A8) * t^7 + fA4
// and returns  fRes * fArgAbsNormSgn -/+ A0  (p15: negative arg, p14: positive).
// ---------------------------------------------------------------------------
{.mfi
      ldfe           fA1 = [rCoeffAddr3], 32 // Load A1
      fma.s1         fRes = fA19, fArgAbsNorm, fA18 // Polynomial
      nop.i          0
}
{.mfi
      ldfe           fA0 = [rCoeffAddr4], 32 // Load A0
      nop.f          0
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fA17 = fA17, fArgAbsNorm, fA16 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fA15 = fA15, fArgAbsNorm, fA14 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fTDeg7 = fTDeg3, fTQuadr, f0 // Polynomial
      nop.i          0
}
{ .mfi
      nop.m          0
      fma.s1         fA13 = fA13, fArgAbsNorm, fA12 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fA11 = fA11, fArgAbsNorm, fA10 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fA9 = fA9, fArgAbsNorm, fA8 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fRes = fRes, fTSqr, fA17 // Polynomial
      nop.i          0
}
{ .mfi
      nop.m          0
      fma.s1         fA7 = fA7, fArgAbsNorm, fA6 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fA5 = fA5, fArgAbsNorm, f0 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fA15 = fA15, fTSqr, fA13 // Polynomial
      nop.i          0
}
{ .mfi
      nop.m          0
      fma.s1         fA4 = fA4, fArgAbsNorm, fA3 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fA2 = fA2, fArgAbsNorm, fA1 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fA11 = fA11, fTSqr, fA9 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fA7 = fA7, fTSqr, fA5 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fRes = fRes, fTQuadr, fA15 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fA4 = fA4, fTSqr, fA2 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fRes = fRes, fTQuadr, fA11 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fA4 = fA7, fTDeg3, fA4 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fRes = fRes, fTDeg7, fA4 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      // result for negative argument
(p15) fms.d.s0       f8 = fRes, fArgAbsNormSgn, fA0 // Polynomial
      nop.i          0
}
{ .mfb
      nop.m          0
      // result for positive argument
(p14) fma.d.s0       f8 = fRes, fArgAbsNormSgn, fA0 // Polynomial
      br.ret.sptk    b0
}
;;

// |x| < 0.25 Path /////////////////////////////////////////////////////////////
// Evaluates  x + x*P(x^2)  with
//     P = A9*x^18 + A8*x^16 + ... + A2*x^4 + A1*x^2
// using fArgSqr (presumably x^2, computed before this point -- confirm),
// fTSqr = fArgSqr^2 and fTQuadr = fTSqr^2.
.align 32
tanh_near_zero:
{ .mfi
      adds           rCoeffAddr1 = 0xC80, rDataPtr // address of A9
      fma.s0         fTSqr = fArgSqr, fArgSqr, f0 // x^4
      nop.i          0
}
{ .mfi
      adds           rCoeffAddr2 = 0xCB0, rDataPtr // address of A7
      nop.f          0
      nop.i          0
}
;;
{ .mfi
      ldfpd          fA9, fA8 = [rCoeffAddr1], 16 // Load A9, A8
      nop.f          0
      nop.i          0
}
{ .mfi
      ldfpd          fA7, fA6 = [rCoeffAddr2], 16 // Load A7, A6
      nop.f          0
      nop.i          0
}
;;
{ .mfi
      ldfpd          fA5, fA4 = [rCoeffAddr1], 16 // Load A5, A4
      nop.f          0
      nop.i          0
}
{ .mfi
      ldfpd          fA3, fA2 = [rCoeffAddr2], 16 // Load A3, A2
      nop.f          0
      nop.i          0
}
;;
{ .mfi
      ldfe           fA1 = [rCoeffAddr1] // Load A1
      nop.f          0
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fTQuadr = fTSqr, fTSqr, f0 // x^8 (= fTSqr^2)
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fRes = fA9, fArgSqr, fA8 // Polynomial
      nop.i          0
}
{ .mfi
      nop.m          0
      fma.s1         fA7 = fA7, fArgSqr, fA6 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fA3 = fA3, fArgSqr, fA2 // Polynomial
      nop.i          0
}
{ .mfi
      nop.m          0
      fma.s1         fA5 = fA5, fArgSqr, fA4 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fA1 = fA1, fArgSqr, f0 // Polynomial
      nop.i          0
}
{ .mfi
      nop.m          0
      // NOTE(review): fTQuadrSgn is not read again on this path before the
      // return below; kept as in the original.
      fma.s1         fTQuadrSgn = fTQuadr, f8, f0 // x^8 * x
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fRes = fRes, fTSqr, fA7 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fA1 = fA3, fTSqr, fA1 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fRes = fRes, fTSqr, fA5 // Polynomial
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fma.s1         fRes = fRes, fTQuadr, fA1 // Polynomial
      nop.i          0
}
;;
{ .mfb
      nop.m          0
      fma.d.s0       f8 = fRes, f8, f8 // x+x*Polynomial
      br.ret.sptk    b0 // Exit for |x| < 0.25
}
;;

// 19.0625 <= |x| < +inf Saturation path ///////////////////////////////////////
.align 32
tanh_saturation:
{ .mfi
      adds           rDataPtr = 0xCD0, rDataPtr // address of A0
      nop.f          0
      nop.i          0
}
;;
{ .mfi
      // NOTE(review): the original comment read "Load A0 = 2^(-63)", but the
      // result below is the plain product fA0 * fSignumX and is described as
      // sign(x)*(1.0-2^(-63)); the table entry is presumably 1.0-2^(-63).
      // Confirm against the data table (not visible here).
      ldfe           fA0 = [rDataPtr] // Load A0
      nop.f          0
      nop.i          0
}
;;
{ .mfb
      nop.m          0
      fma.d.s0       f8 = fA0, fSignumX, f0 // sign(x)*(1.0-2^(-63))
      br.ret.sptk    b0 // Exit for 19.0625 <=|x|< +inf
}
;;

// 0, denormals and special IEEE numbers path /////////////////////////////////
// Infinities return +/-1, zeros and NaNs return x*1+x, and whatever remains
// (denormals, per the path title) returns r -/+ r^2 after normalization.
_tanh_spec:
{ .mfi
      cmp.lt         p15, p14 = rArg, r0 // Is arg negative (p15)
                                         // or positive (p14)
      fclass.m       p6,p0 = f8, 0x23 // To filter infinities
                                      // 0x23 = @pos|@neg|@inf
      nop.i          0
}
;;
{ .mfi
      nop.m          0
      fclass.m       p7,p0 = f8, 0xC7 // To filter NaNs & Zeros
                                      // 0xC7 = @pos|@neg|@zero|@qnan|@snan
      nop.i          0
}
;;
{ .mfb
      nop.m          0
(p6)  fmerge.s       f8 = f8, f1 // +/-1 for INF args
(p6)  br.ret.spnt    b0 // exit for x = INF
}
;;
{ .mfb
      nop.m          0
(p7)  fma.d.s0       f8 = f8, f1, f8 // +/-0 for 0 args
                                     // and NaNs for NaNs
(p7)  br.ret.spnt    b0 // exit for x = NaN or +/-0
}
;;
{ .mfi
      nop.m          0
      fnorm.s0       f8 = f8 // Normalize arg
      nop.i          0
}
;;
.pred.rel "mutex",p14,p15
{ .mfi
      nop.m          0
(p14) fnma.d.s0      f8 = f8, f8, f8 // res = r-r^2
      nop.i          0
}
{ .mfb
      nop.m          0
(p15) fma.d.s0       f8 = f8, f8, f8 // res = r+r^2
      br.ret.sptk    b0 // 0, denormals, specials return
}
;;
GLOBAL_LIBM_END(tanh)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -