📄 s_tanhl.s
字号:
data8 0xBA0EC1879495150B, 0x0000BFF5 // A15 = -1.4195071451378679802688367813e-03data8 0xEB5A82898D1BCBA4, 0x00003FF6 // A13 = 3.5912102408030526706365632879e-03data8 0x91370DAFE0B64438, 0x0000BFF8 // A11 = -8.8632234251336964576640807982e-03data8 0xB327A435358F1200, 0x00003FF9 // A9 = 2.1869488447622383899199238857e-02data8 0xDD0DD0DD07A0775F, 0x0000BFFA // A7 = -5.3968253967902161405327069187e-02data8 0x888888888887C299, 0x00003FFC // A5 = 1.3333333333333264660338062012e-01data8 0xAAAAAAAAAAAAAA98, 0x0000BFFD // A3 = -3.3333333333333333282255458755e-01LOCAL_OBJECT_END(_0_to_1o8_data).section .textGLOBAL_LIBM_ENTRY(tanhl){ .mfi alloc r32 = ar.pfs, 0, 21, 0, 0 fmerge.se fArgAbsNorm = f1, f8 // normalized x (1.0 <= x < 2.0) addl rSignBit = 0x20000, r0 // Set sign bit for exponent}{ .mlx addl rDataPtr = @ltoff(tanhl_data), gp // Get common data ptr movl r1p5 = 0x3FF8000000000000 // 1.5 in dbl repres.};;{ .mfi getf.exp rArgExp = f8 // Get arg exponent fclass.m p6,p0 = f8, 0xEF // Filter 0, denormals and specials // 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf addl rBias = 0xfffc, r0 // Value to subtract from exp // to get actual interval number}{ .mfi ld8 rDataPtr = [rDataPtr] // Get real common data pointer fma.s1 fArgSqr = f8, f8, f0 // x^2 (for [0;1/8] path) addl r2to4 = 0x10000, r0 // unbiased exponent // for [2;4] binary interval};;{ .mfi getf.sig rArgSig = f8 // Get arg significand fcmp.lt.s1 p15, p14 = f8, f0 // Is arg negative/positive? addl rSaturation = 0xb70, r0 // First 12 bits of // saturation value signif.}{ .mfi setf.d f1p5 = r1p5 // 1.5 construction fma.s1 f2p0 = f1,f1,f1 // 2.0 construction addl r1625Sign = 0xd01, r0 // First 12 bits of // 1.625 value signif. // 1.625 significand used to filter values greater than 3.25, 6.5, 13.0};;{ .mfi addl rTailDataPtr = 0xB00, rDataPtr // Pointer to "tail" data fmerge.s fSignumX = f8, f1 // signum(x) andcm rArgExp = rArgExp, rSignBit // Remove sign of exp}{ .mfb addl rTiny = 0xf000, r0 // Tiny value for saturation path nop.f 0(p6) br.cond.spnt tanhl_spec // Branch to zero, denorm & specs };;{ .mfi sub rInterval = rArgExp, rBias // Get actual interval number nop.f 0 shr.u rArgSig = rArgSig, 52 // Leave only 12 bits of sign. }{ .mfi adds rShiftedDataPtr = 0x10, rDataPtr // Second ptr to data nop.f 0 cmp.ge p8, p10 = rArgExp, r2to4 // If exp >= 2to4 interval?};;{ .mfi(p8) cmp.le p8, p10 = r1625Sign, rArgSig // If signd is greater // than 1.625? (arg is at one of binary subranges) nop.f 0 shl rOffset = rInterval, 8 // Make offset from // interval number}{ .mfi cmp.gt p9, p0 = 0x0, rInterval // If interval is less than 0 // (means arg is in [0; 1/8]) nop.f 0 cmp.eq p7, p0 = 0x7, rInterval // If arg is in [16;] interv.?};;{ .mfi(p8) adds rOffset = 0x400, rOffset // Add additional offset // (arg is at one of binary subranges) fma.s1 fArgCube = fArgSqr, f8, f0 // x^3 (for [0;1/8] path) shl rTailOffset = rInterval, 7 // Make offset to "tail" data // from interval number}{ .mib setf.exp fTiny = rTiny // Construct "tiny" value // for saturation path cmp.ltu p11, p0 = 0x7, rInterval // if arg > 32(p9) br.cond.spnt _0_to_1o8 };;{ .mfi add rAddr1 = rDataPtr, rOffset // Get address for // interval data nop.f 0 shl rTailAddOffset = rInterval, 5 // Offset to interval // "tail" data }{ .mib add rAddr2 = rShiftedDataPtr, rOffset // Get second // address for interval data (p7) cmp.leu p11, p0 = rSaturation, rArgSig // if arg is // in [22.8;32] interval(p11) br.cond.spnt _saturation // Branch to Saturation path};;{ .mmi ldfe fA3 = [rAddr1], 0x90 // Load A3 ldfpd fA2H, fA2L = [rAddr2], 16 // Load A2High, A2Low add rTailOffset = rTailOffset, rTailAddOffset // "Tail" offset};;{ .mmi ldfe fA20 = [rAddr1], 16 // Load A20 ldfpd fA1H, fA1L = [rAddr2], 16 // Load A1High, A1Low(p8) adds rTailOffset = 0x280, rTailOffset // Additional offset // (arg is at one of binary subranges)};;{ .mmi ldfe fA19 = [rAddr1], 16 // Load A19 ldfpd fA0H, fA0L = [rAddr2], 16 // Load A0High, A0Low add rTailAddr1 = rTailDataPtr, rTailOffset // First tail // data address};;.pred.rel "mutex",p8,p10{ .mfi ldfe fA18 = [rAddr1], 16 // Load A18(p8) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f2p0 // Add 2.0 // (arg is at one of binary subranges) adds rTailAddr2 = 0x10, rTailAddr1 // First tail // data address}{ .mfi ldfe fA25 = [rAddr2], 16 // Load A25 (p10) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1p5 // Add 1.5 // to normalized arg nop.i 0};;{ .mmi ldfe fA17 = [rAddr1], 16 // Load A17 ldfe fA24 = [rAddr2], 16 // Load A24 nop.i 0};;{ .mmi ldfe fA16 = [rAddr1], 16 // Load A16 ldfe fA23 = [rAddr2], 16 // Load A23 nop.i 0};;{ .mmi ldfe fA15 = [rAddr1], 16 // Load A15 ldfe fA22 = [rAddr2], 16 // Load A22 nop.i 0};;{ .mmi ldfe fA14 = [rAddr1], 16 // Load A14 ldfe fA21 = [rAddr2], 16 // Load A21 nop.i 0};;{ .mfi ldfe fA13 = [rTailAddr1], 32 // Load A13 fms.s1 fArgAbsNorm2 = fArgAbsNorm, fArgAbsNorm, f0 // x^2 nop.i 0}{ .mfi ldfe fA12 = [rTailAddr2], 32 // Load A12 nop.f 0 nop.i 0};;{ .mfi ldfe fA11 = [rTailAddr1], 32 // Load A11 fma.s1 fRes3H = fA3, fArgAbsNorm, fA2H // (A3*x+A2)*x^2 nop.i 0}{ .mfi ldfe fA10 = [rTailAddr2], 32 // Load A10 fma.s1 fTH = fA3, fArgAbsNorm, f0 // (A3*x+A2)*x^2 nop.i 0};;{ .mfi ldfe fA9 = [rTailAddr1], 32 // Load A9 fma.s1 fTT2 = fA1L, fArgAbsNorm, f0 // A1*x+A0 nop.i 0}{ .mfi ldfe fA8 = [rTailAddr2], 32 // Load A8 nop.f 0 nop.i 0};;{ .mmi ldfe fA7 = [rTailAddr1], 32 // Load A7 ldfe fA6 = [rTailAddr2], 32 // Load A6 nop.i 0};;{ .mmi ldfe fA5 = [rTailAddr1], 32 // Load A5 ldfe fA4 = [rTailAddr2], 32 // Load A4 nop.i 0};;{ .mfi nop.m 0 fms.s1 fArgAbsNorm2L = fArgAbsNorm, fArgAbsNorm, fArgAbsNorm2 // Low part of x^2 (delta) nop.i 0}{ .mfi nop.m 0 fms.s1 fArgAbsNorm4 = fArgAbsNorm2, fArgAbsNorm2, f0 // x^4 nop.i 0};;{ .mfi nop.m 0 fms.s1 fRes3L = fA2H, f1, fRes3H // // (A3*x+A2)*x^2 nop.i 0};;{ .mfi nop.m 0 fms.s1 fArgAbsNorm3 = fArgAbsNorm2, fArgAbsNorm, f0 // x^3 nop.i 0}{ .mfi nop.m 0 fma.s1 fTH2 = fA1H, fArgAbsNorm, fTT2 // A1*x+A0 nop.i 0};;{ .mfi nop.m 0 fma.s1 fA23 = fA24, fArgAbsNorm, fA23 // Polynomial tail nop.i 0}{ .mfi nop.m 0 fma.s1 fA21 = fA22, fArgAbsNorm, fA21 // Polynomial tail nop.i 0};;{ .mfi nop.m 0 fma.s1 fA12 = fA13, fArgAbsNorm, fA12 // Polynomial tail nop.i 0};;{ .mfi nop.m 0 fma.s1 fRes3L = fRes3L, f1, fTH // (A3*x+A2)*x^2 nop.i 0}{ .mfi nop.m 0 fma.s1 fA19 = fA20, fArgAbsNorm, fA19 // Polynomial tail nop.i 0};;{ .mfi nop.m 0 fma.s1 fRes1H = fTH2, f1, fA0H // A1*x+A0 nop.i 0}{ .mfi nop.m 0 fms.s1 fTL2 = fA1H, fArgAbsNorm, fTH2 // A1*x+A0 nop.i 0};;{ .mfi nop.m 0 fma.s1 fA8 = fA9, fArgAbsNorm, fA8 // Polynomial tail nop.i 0}{ .mfi nop.m 0 fma.s1 fA10 = fA11, fArgAbsNorm, fA10 // Polynomial tail nop.i 0};;{ .mfi nop.m 0 fma.s1 fA15 = fA16, fArgAbsNorm, fA15 // Polynomial tail nop.i 0}{ .mfi nop.m 0 fma.s1 fA17 = fA18, fArgAbsNorm, fA17 // Polynomial tail nop.i 0};;{ .mfi nop.m 0 fms.s1 fArgAbsNorm11 = fArgAbsNorm4, fArgAbsNorm4, f0 // x^8 nop.i 0}{ .mfi nop.m 0 fma.s1 fA4 = fA5, fArgAbsNorm, fA4 // Polynomial tail nop.i 0};;{ .mfi nop.m 0 fma.s1 fRes3L = fRes3L, f1, fA2L // (A3*x+A2)*x^2 nop.i 0}{ .mfi nop.m 0 fma.s1 fA6 = fA7, fArgAbsNorm, fA6 // Polynomial tail nop.i 0};;{ .mfi nop.m 0 fma.s1 fTL2 = fTL2, f1, fTT2 // A1*x+A0 nop.i 0}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -