⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 s_tanh.s

📁 glibc 2.9,最新版的C语言库函数
💻 S
📖 第 1 页 / 共 3 页
字号:
data8 0xE93FB2F482327C19, 0x0000BFF7 //A4//// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0 data8 0xEB189B71ADC40BE2, 0x00003FEA //A19data8 0xA60B46F9FF6DC2DF, 0x00003FEA //A18data8 0xBB061CDD9F368B9D, 0x00003FEC //A17data8 0x841E08BDF5429991, 0x0000BFEC //A16data8 0xDD33990B433F25BE, 0x00003FED //A15data8 0xBA5DE6B870F0A2BB, 0x0000BFEE //A14data8 0xA71D489AAA6DACF0, 0x00003FEF //A13data8 0x874CCB2B8F3FBC0E, 0x0000BFF0 //A12data8 0xCB1D2E9754EA534A, 0x00003FF0 //A11data8 0x8BA5ABB53BA6ABCF, 0x0000BFF1 //A10data8 0xAE91FD1C2391A32B, 0x00003FF1 //A9data8 0xC465A74B798E5761, 0x0000BFF1 //A8data8 0xC4666152397D15C1, 0x00003FF1 //A7data8 0xABD9E63CA575B950, 0x0000BFF1 //A6data8 0x80E38B18E8D0F460, 0x00003FF1 //A5data8 0xA11C80E20AAFDD3C, 0x0000BFF0 //A4//// Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0 data8 0xBECD0AF7E22E5594, 0x00003FE9 //A19data8 0xE2834E2D68C1128C, 0x00003FEA //A18data8 0x97B117611B317379, 0x00003FEB //A17data8 0xEE91A0D39A772F6B, 0x00003FEA //A16data8 0x92F6EC377DCADA4F, 0x00003FEA //A15data8 0xD8FCCD6A3277FAB7, 0x00003FE8 //A14data8 0xC15AB9CB0C3DCFE0, 0x00003FE7 //A13data8 0xC3C659704A7147CD, 0x00003FE2 //A12data8 0xFA17F09D27C97912, 0x00003FE4 //A11data8 0xF664147182B94788, 0x0000BFE3 //A10data8 0xA6C89FA741464DA1, 0x00003FE3 //A9data8 0xB90FE464A825EFA8, 0x0000BFE2 //A8data8 0xB973AE0FD86EC024, 0x00003FE1 //A7data8 0xA23A087F96846951, 0x0000BFE0 //A6data8 0xF358D8A7FC012D5D, 0x00003FDE //A5data8 0x98176E2309B7C73A, 0x0000BFDD //A4//// Coefficients ##16..19 ("tail" coefficient tables)// Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5 data8 0x838F209ABB9BA7B3, 0x0000BFF7 //A3data8 0xEBC0AC78DA4FC500, 0x0000BFF8 //A2data8 0xF0A4D02960B60E69, 0x00003FFC //A1data8 0xFACBF534D0E42F8A, 0x00003FFC //A0//// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0 data8 0xC0ECBDC0A0D133A6, 0x0000BFF8 //A3data8 0xBA13A076BF8E812F, 0x0000BFFB //A2data8 0xC954A37D1A1CA070, 0x00003FFD //A1data8 0xEC9A9EBAB4579B29, 0x00003FFD //A0//// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0 data8 0xD42E9175A6EA1397, 0x00003FFB //A3data8 0xA3C361378A55CF56, 0x0000BFFD //A2data8 0xD706E07CC8622983, 0x00003FFD //A1data8 0xC2F7D5A8A79CA2AC, 0x00003FFE //A0//// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 3.25data8 0xAC7A7F8776817C7E, 0x00003FFD //A3data8 0x8B7CE95E69FCFE9A, 0x0000BFFD //A2data8 0x90B161317028D995, 0x00003FFC //A1data8 0xF6CA82F0DE1E9E9A, 0x00003FFE //A0//// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 6.5data8 0xE9E072407BC22DC6, 0x00003FFA //A3data8 0xAFA4A913D8E6BB4A, 0x0000BFF9 //A2data8 0xAFC2D6A885BAA875, 0x00003FF7 //A1data8 0xFFD40B84505A10B2, 0x00003FFE //A0//// Polynomial coefficients for the tanh(x), 8.0 <= |x| < 13.0data8 0xA11C8A1FED168CD5, 0x00003FF2 //A3data8 0xF1AAD6B02063A5F5, 0x0000BFEF //A2data8 0xF1AADA46AD341C34, 0x00003FEC //A1data8 0xFFFFFC39548FC34B, 0x00003FFE //A0//// Polynomial coefficients for the tanh(x), 16.0 <= |x| < 19.0625data8 0x98176FD1F0950C16, 0x00003FDE //A3data8 0xE42327BB09C8B2A5, 0x0000BFDA //A2data8 0xE42327BB0B154F13, 0x00003FD6 //A1data8 0xFFFFFFFFFFF8DEE7, 0x00003FFE //A0//// Binary subranges// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4.0data8 0xE9E072404329293B, 0x00003FF7 //A3data8 0xAFA4A913D798300B, 0x0000BFF7 //A2data8 0xAFC2D6A885B48567, 0x00003FF6 //A1data8 0xFFD40B84505A10B4, 0x00003FFE //A0//// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0data8 0xA11C8A63815F7A28, 0x00003FEF //A3data8 0xF1AAD6B65B0EBF53, 0x0000BFED //A2data8 0xF1AADA46E799831F, 0x00003FEB //A1data8 0xFFFFFC39548FC348, 0x00003FFE //A0//// Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0data8 0x98176FE982140A59, 0x00003FDB //A3data8 0xE42327B9B0D7202F, 0x0000BFD8 //A2data8 0xE42327BB13076BD6, 0x00003FD5 //A1data8 0xFFFFFFFFFFF8DEE7, 0x00003FFE //A0//// Polynomial coefficients for the tanh(x), 0.0 <= |x| < 0.25 // ('tanh_near_zero' path)data8 0xBF2BA5D26E479D0C //A9data8 0x3F4336D96F81EE26 //A8data8 0xBF8226E34AE197B0 //A5data8 0x3F9664F488148657 //A4data8 0xAAAAAAAAAAAAAA99, 0x0000BFFD //A1data8 0xBF57D91925BB5EE2 //A7data8 0x3F6D6D36C3D5B7A1 //A6data8 0xBFABA1BA1BA19D32 //A3data8 0x3FC1111111111108 //A2//// 1.0 - 2^(-63)// ('tanh_saturation' path)data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE LOCAL_OBJECT_END(tanh_data)// CAUTION: The order of table coefficients shouldn't be changed!.section .textGLOBAL_LIBM_ENTRY(tanh){ .mfi      alloc          r32         = ar.pfs, 0, 20, 0, 0      fmerge.se      fArgAbsNorm = f1, f8         // normalized x      adds           rSignBit    = 0x1, r0        // Bit for sign removing}{ .mfi      addl           rDataPtr    = @ltoff(tanh_data), gp // Data pointer      fma.s1         fTwo        = f1, f1, f1            // 2.0 construct      addl           rArgSgnd    = 0xfff, r0             // mask for exponent};;{ .mfi      getf.d         rArg        = f8       // x in GR       fclass.m       p6,p0       = f8, 0xEF // Filter 0, denormals and specials                             // 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf      shl            rArgSgnd    = rArgSgnd, 52  // mask for exponent}{ .mlx      ld8            rDataPtr    = [rDataPtr]        // Real data pointer      movl           r1625Sgnd   = 0xA000000000000   // 1.625 signd      // 1.625 significand used to filter values greater than 3.25, 6.5, 13.0      // to enter binary subranges};;{ .mfi      addl           rBias       = 0x3FD00, r0       // bias of 0.25 << 8      fma.s1         fArgSqr     = f8, f8, f0        // x^2      shl            rSignBit    = rSignBit, 63      // mask for sign bit}{ .mlx      addl           rMask       = 0x7FF00, r0          // Mask for index bits      movl           rTwo        = 0x4000000000000000   // 2.0};;{ .mfi      andcm          rArgSgnd    = rArg, rArgSgnd // Remove exponent      nop.f          0      shr.u          rShiftedArg = rArg, 44 // Select only necessary bits of arg}{ .mfb      andcm          rAbsArg     = rArg, rSignBit     // Remove sign      nop.f          0(p6)  br.cond.spnt   _tanh_spec    // Branch to zero, denorm & specs};;   { .mfi      and            rShiftedArgMasked = rShiftedArg, rMask // bias of x << 8      fmerge.s       fArgAbs     = f1, f8                   // |x|      shr            rShiftedAbsArg    = rAbsArg, 44 // Select only necessary                                                      // bits of absolute arg}{ .mfi      cmp.gt         p8, p11     = rArgSgnd, r1625Sgnd // p8 = 1 if      // signd(x) > 1.625 - to filter values greater than 3.25, 6.5, 13.0      nop.f          0      nop.i          0};;{ .mfi      sub            rIndex      = rShiftedArgMasked, rBias // index << 8      nop.f          0       cmp.lt         p10, p0     = rShiftedArgMasked, rBias // p10=1 if |x|<0.25}{ .mfb(p8)  cmp.gt         p8, p11     = rAbsArg, rTwo // If arg is greater than 2.0?                                       // (then we should use binary subranges)      nop.f          0 (p10) br.cond.spnt   tanh_near_zero    // branch out if |x| < 0.25};;.pred.rel "mutex",p8,p11{ .mfi(p8)  add            rIndex      = 0x400, rIndex // Make pointer to binary                                                  // subranges(p11) fms.s1         fArgAbsNorm = fArgAbsNorm, f1, f1     // |x|/b - 1.0      addl           rSaturation = 0x40331, r0 // shifted bits of 19.0625}{ .mfi      nop.m          0 (p8)  fms.s1         fArgAbsNorm = fArgAbsNorm, f1, fTwo // |x|/b - 2.0       // this is only for binary subranges [3.25;4], [6.5;8], [13.0;16]      nop.i          0 };;{ .mfi      add            rCoeffAddr1 = rDataPtr, rIndex// coeff. ##0,2,..14      nop.f          0      nop.i          0};;{ .mfi      adds           rCoeffAddr2 = 16, rCoeffAddr1 // Shifted pointer to coeffs      fmerge.s       fSignumX    = f8, f1          // signum(x)      nop.i          0} { .mfb      cmp.le         p12, p0     = rSaturation, rShiftedAbsArg // |x|>=19.0625?      nop.f          0(p12) br.cond.spnt   tanh_saturation          // branch out if x |x| >= 19.0625};;{.mfi      ldfe           fA19        = [rCoeffAddr1], 32 // Load A19      nop.f          0      nop.i          0}{.mfi      ldfe           fA18        = [rCoeffAddr2], 32 // Load A18      nop.f          0      adds           rCoeffAddr3 = 0xA00, rDataPtr   // Pointer to "tail"                                                     // coefficients tables};;{.mfi      ldfe           fA17        = [rCoeffAddr1], 32 // Load A17      nop.f          0      nop.i          0}{.mfi      ldfe           fA16        = [rCoeffAddr2], 32 // Load A16      nop.f          0      nop.i          0};;{.mfi      ldfe           fA15        = [rCoeffAddr1], 32 // Load A15      fma.s1         fTSqr       = fArgAbsNorm, fArgAbsNorm, f0 // x^2      shr.u          rIndex      = rIndex, 2 // Index for "tail" tables}{.mfi      ldfe           fA14        = [rCoeffAddr2], 32 // Load A14      nop.f          0      adds           rCoeffAddr4 = 16, r0            // Shifter pointer                                                     // to "tail" tables};;{.mfi      ldfe           fA13        = [rCoeffAddr1], 32   // Load A13      nop.f          0      add            rCoeffAddr3 = rCoeffAddr3, rIndex // "tail" coeffs to load                                                       // ##16..23}{.mfi      ldfe           fA12        = [rCoeffAddr2], 32 // Load A12      nop.f          0      cmp.lt         p15, p14    = rArg, r0          // Arg positive (p14)                                                      // or negative (p15)?};;{.mfi      ldfe           fA11        = [rCoeffAddr1], 32        // Load A11      nop.f          0      add            rCoeffAddr4 = rCoeffAddr3, rCoeffAddr4 // shifted "tail"                                                             // coeffs to load }{.mfi      ldfe           fA10        = [rCoeffAddr2], 32 // Load A10      nop.f          0      nop.i          0};;{.mfi      ldfe           fA9         = [rCoeffAddr1], 32 // Load A9      nop.f          0      nop.i          0}{.mfi      ldfe           fA8         = [rCoeffAddr2], 32 // Load A8      nop.f          0      nop.i          0};;{.mfi      ldfe           fA7         = [rCoeffAddr1], 32 // Load A7      nop.f          0      nop.i          0}{.mfi      ldfe           fA6         = [rCoeffAddr2], 32 // Load A6      nop.f          0      nop.i          0};;{.mfi      ldfe           fA5         = [rCoeffAddr1], 32 // Load A5      fma.s1         fTDeg3      = fArgAbsNorm, fTSqr, f0 // x^3      nop.i          0}{.mfi      ldfe           fA4         = [rCoeffAddr2], 32 // Load A4      fma.s1         fTQuadr     = fTSqr, fTSqr, f0  // x^4      nop.i          0};;// Path #3 Polynomial Pol19(y) computation; y = fArgAbsNorm{.mfi      ldfe           fA3         = [rCoeffAddr3], 32            // Load A3      fma.s1         fArgAbsNormSgn = fArgAbsNorm, fSignumX, f0 // sign(x)*x      nop.i          0}{.mfi      ldfe           fA2         = [rCoeffAddr4], 32            // Load A2      nop.f          0      nop.i          0};;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -