📄 s_tanh.s
字号:
data8 0xE93FB2F482327C19, 0x0000BFF7 //A4 (last entry of a table whose header precedes this excerpt)
//
// Each coefficient below is stored as a pair of data8 words (16 bytes) and is
// read with ldfe.  The "long" tables hold A19..A4 (16 entries = 0x100 bytes),
// the "tail" tables hold A3..A0 (4 entries = 0x40 bytes).
//
// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
data8 0xEB189B71ADC40BE2, 0x00003FEA //A19
data8 0xA60B46F9FF6DC2DF, 0x00003FEA //A18
data8 0xBB061CDD9F368B9D, 0x00003FEC //A17
data8 0x841E08BDF5429991, 0x0000BFEC //A16
data8 0xDD33990B433F25BE, 0x00003FED //A15
data8 0xBA5DE6B870F0A2BB, 0x0000BFEE //A14
data8 0xA71D489AAA6DACF0, 0x00003FEF //A13
data8 0x874CCB2B8F3FBC0E, 0x0000BFF0 //A12
data8 0xCB1D2E9754EA534A, 0x00003FF0 //A11
data8 0x8BA5ABB53BA6ABCF, 0x0000BFF1 //A10
data8 0xAE91FD1C2391A32B, 0x00003FF1 //A9
data8 0xC465A74B798E5761, 0x0000BFF1 //A8
data8 0xC4666152397D15C1, 0x00003FF1 //A7
data8 0xABD9E63CA575B950, 0x0000BFF1 //A6
data8 0x80E38B18E8D0F460, 0x00003FF1 //A5
data8 0xA11C80E20AAFDD3C, 0x0000BFF0 //A4
//
// Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0
data8 0xBECD0AF7E22E5594, 0x00003FE9 //A19
data8 0xE2834E2D68C1128C, 0x00003FEA //A18
data8 0x97B117611B317379, 0x00003FEB //A17
data8 0xEE91A0D39A772F6B, 0x00003FEA //A16
data8 0x92F6EC377DCADA4F, 0x00003FEA //A15
data8 0xD8FCCD6A3277FAB7, 0x00003FE8 //A14
data8 0xC15AB9CB0C3DCFE0, 0x00003FE7 //A13
data8 0xC3C659704A7147CD, 0x00003FE2 //A12
data8 0xFA17F09D27C97912, 0x00003FE4 //A11
data8 0xF664147182B94788, 0x0000BFE3 //A10
data8 0xA6C89FA741464DA1, 0x00003FE3 //A9
data8 0xB90FE464A825EFA8, 0x0000BFE2 //A8
data8 0xB973AE0FD86EC024, 0x00003FE1 //A7
data8 0xA23A087F96846951, 0x0000BFE0 //A6
data8 0xF358D8A7FC012D5D, 0x00003FDE //A5
data8 0x98176E2309B7C73A, 0x0000BFDD //A4
//
// Coefficients ##16..19 ("tail" coefficient tables)
// Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5
data8 0x838F209ABB9BA7B3, 0x0000BFF7 //A3
data8 0xEBC0AC78DA4FC500, 0x0000BFF8 //A2
data8 0xF0A4D02960B60E69, 0x00003FFC //A1
data8 0xFACBF534D0E42F8A, 0x00003FFC //A0
//
// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
data8 0xC0ECBDC0A0D133A6, 0x0000BFF8 //A3
data8 0xBA13A076BF8E812F, 0x0000BFFB //A2
data8 0xC954A37D1A1CA070, 0x00003FFD //A1
data8 0xEC9A9EBAB4579B29, 0x00003FFD //A0
//
// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
data8 0xD42E9175A6EA1397, 0x00003FFB //A3
data8 0xA3C361378A55CF56, 0x0000BFFD //A2
data8 0xD706E07CC8622983, 0x00003FFD //A1
data8 0xC2F7D5A8A79CA2AC, 0x00003FFE //A0
//
// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 3.25
data8 0xAC7A7F8776817C7E, 0x00003FFD //A3
data8 0x8B7CE95E69FCFE9A, 0x0000BFFD //A2
data8 0x90B161317028D995, 0x00003FFC //A1
data8 0xF6CA82F0DE1E9E9A, 0x00003FFE //A0
//
// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 6.5
data8 0xE9E072407BC22DC6, 0x00003FFA //A3
data8 0xAFA4A913D8E6BB4A, 0x0000BFF9 //A2
data8 0xAFC2D6A885BAA875, 0x00003FF7 //A1
data8 0xFFD40B84505A10B2, 0x00003FFE //A0
//
// Polynomial coefficients for the tanh(x), 8.0 <= |x| < 13.0
data8 0xA11C8A1FED168CD5, 0x00003FF2 //A3
data8 0xF1AAD6B02063A5F5, 0x0000BFEF //A2
data8 0xF1AADA46AD341C34, 0x00003FEC //A1
data8 0xFFFFFC39548FC34B, 0x00003FFE //A0
//
// Polynomial coefficients for the tanh(x), 16.0 <= |x| < 19.0625
data8 0x98176FD1F0950C16, 0x00003FDE //A3
data8 0xE42327BB09C8B2A5, 0x0000BFDA //A2
data8 0xE42327BB0B154F13, 0x00003FD6 //A1
data8 0xFFFFFFFFFFF8DEE7, 0x00003FFE //A0
//
// Binary subranges
// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4.0
data8 0xE9E072404329293B, 0x00003FF7 //A3
data8 0xAFA4A913D798300B, 0x0000BFF7 //A2
data8 0xAFC2D6A885B48567, 0x00003FF6 //A1
data8 0xFFD40B84505A10B4, 0x00003FFE //A0
//
// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
data8 0xA11C8A63815F7A28, 0x00003FEF //A3
data8 0xF1AAD6B65B0EBF53, 0x0000BFED //A2
data8 0xF1AADA46E799831F, 0x00003FEB //A1
data8 0xFFFFFC39548FC348, 0x00003FFE //A0
//
// Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0
data8 0x98176FE982140A59, 0x00003FDB //A3
data8 0xE42327B9B0D7202F, 0x0000BFD8 //A2
data8 0xE42327BB13076BD6, 0x00003FD5 //A1
data8 0xFFFFFFFFFFF8DEE7, 0x00003FFE //A0
//
// Polynomial coefficients for the tanh(x), 0.0 <= |x| < 0.25
// ('tanh_near_zero' path)
// Note: single data8 words here, except A1 which is a 16-byte pair.
data8 0xBF2BA5D26E479D0C //A9
data8 0x3F4336D96F81EE26 //A8
data8 0xBF8226E34AE197B0 //A5
data8 0x3F9664F488148657 //A4
data8 0xAAAAAAAAAAAAAA99, 0x0000BFFD //A1
data8 0xBF57D91925BB5EE2 //A7
data8 0x3F6D6D36C3D5B7A1 //A6
data8 0xBFABA1BA1BA19D32 //A3
data8 0x3FC1111111111108 //A2
//
// 1.0 - 2^(-63)
// ('tanh_saturation' path)
data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE
LOCAL_OBJECT_END(tanh_data)

// CAUTION: The order of table coefficients shouldn't be changed!
// (The code below addresses the tables by computed byte offsets from
// rDataPtr, so the layout above is part of the algorithm.)


.section .text
//
// tanh entry point.  The argument is read from f8.
//
// Dispatch performed by the visible part of the routine:
//   * fclass.m mask 0xEF matches          -> branch to _tanh_spec
//   * |x| <  0.25                         -> branch to tanh_near_zero
//   * |x| >= 19.0625                      -> branch to tanh_saturation
//   * otherwise: load A19..A4 from the table at rDataPtr + rIndex and
//     A3, A2 from the "tail" table at rDataPtr + 0xA00 + (rIndex >> 2).
//
// rIndex is the exponent-derived table number times 0x100 (the size of one
// A19..A4 table); 0x400 is added to reach the "binary subrange" tables when
// the significand exceeds 1.625 and |x| > 2.0.
//
// NOTE(review): the branch targets and the remainder of the polynomial
// evaluation are outside this excerpt.
//
GLOBAL_LIBM_ENTRY(tanh)

{ .mfi
      alloc          r32 = ar.pfs, 0, 20, 0, 0
      fmerge.se      fArgAbsNorm = f1, f8         // normalized x
      adds           rSignBit = 0x1, r0           // Bit for sign removing
}
{ .mfi
      addl           rDataPtr = @ltoff(tanh_data), gp // Data pointer
      fma.s1         fTwo = f1, f1, f1            // 2.0 construct
      addl           rArgSgnd = 0xfff, r0         // mask for exponent
};;
{ .mfi
      getf.d         rArg = f8                    // x in GR
      fclass.m       p6,p0 = f8, 0xEF             // Filter 0, denormals and specials
                            // 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf
      shl            rArgSgnd = rArgSgnd, 52      // mask for exponent
}
{ .mlx
      ld8            rDataPtr = [rDataPtr]        // Real data pointer
      movl           r1625Sgnd = 0xA000000000000  // 1.625 signd
      // 1.625 significand used to filter values greater than 3.25, 6.5, 13.0
      // to enter binary subranges
};;
{ .mfi
      addl           rBias = 0x3FD00, r0          // bias of 0.25 << 8
      fma.s1         fArgSqr = f8, f8, f0         // x^2
      shl            rSignBit = rSignBit, 63      // mask for sign bit
}
{ .mlx
      addl           rMask = 0x7FF00, r0          // Mask for index bits
      movl           rTwo = 0x4000000000000000    // 2.0
};;
{ .mfi
      andcm          rArgSgnd = rArg, rArgSgnd    // Remove exponent
      nop.f          0
      shr.u          rShiftedArg = rArg, 44       // Select only necessary bits of arg
}
{ .mfb
      andcm          rAbsArg = rArg, rSignBit     // Remove sign
      nop.f          0
(p6)  br.cond.spnt   _tanh_spec                   // Branch to zero, denorm & specs
};;
{ .mfi
      and            rShiftedArgMasked = rShiftedArg, rMask // bias of x << 8
      fmerge.s       fArgAbs = f1, f8             // |x|
      shr            rShiftedAbsArg = rAbsArg, 44 // Select only necessary
                                                  // bits of absolute arg
}
{ .mfi
      cmp.gt         p8, p11 = rArgSgnd, r1625Sgnd // p8 = 1 if
      // signd(x) > 1.625 - to filter values greater than 3.25, 6.5, 13.0
      nop.f          0
      nop.i          0
};;
{ .mfi
      sub            rIndex = rShiftedArgMasked, rBias // index << 8
      nop.f          0
      cmp.lt         p10, p0 = rShiftedArgMasked, rBias // p10=1 if |x|<0.25
}
{ .mfb
(p8)  cmp.gt         p8, p11 = rAbsArg, rTwo      // If arg is greater than 2.0?
                                     // (then we should use binary subranges)
      nop.f          0
(p10) br.cond.spnt   tanh_near_zero               // branch out if |x| < 0.25
};;
.pred.rel "mutex",p8,p11
{ .mfi
(p8)  add            rIndex = 0x400, rIndex       // Make pointer to binary
                                                  // subranges
(p11) fms.s1         fArgAbsNorm = fArgAbsNorm, f1, f1 // |x|/b - 1.0
      addl           rSaturation = 0x40331, r0    // shifted bits of 19.0625
}
{ .mfi
      nop.m          0
(p8)  fms.s1         fArgAbsNorm = fArgAbsNorm, f1, fTwo // |x|/b - 2.0
      // this is only for binary subranges [3.25;4], [6.5;8], [13.0;16]
      nop.i          0
};;
{ .mfi
      add            rCoeffAddr1 = rDataPtr, rIndex // coeff. ##0,2,..14
      nop.f          0
      nop.i          0
};;
{ .mfi
      adds           rCoeffAddr2 = 16, rCoeffAddr1 // Shifted pointer to coeffs
      fmerge.s       fSignumX = f8, f1            // signum(x)
      nop.i          0
}
{ .mfb
      cmp.le         p12, p0 = rSaturation, rShiftedAbsArg // |x|>=19.0625?
      nop.f          0
(p12) br.cond.spnt   tanh_saturation              // branch out if |x| >= 19.0625
};;
// Two pointers 16 bytes apart, each post-incremented by 32, walk the
// A19..A4 table in alternation.
{.mfi
      ldfe           fA19 = [rCoeffAddr1], 32     // Load A19
      nop.f          0
      nop.i          0
}
{.mfi
      ldfe           fA18 = [rCoeffAddr2], 32     // Load A18
      nop.f          0
      adds           rCoeffAddr3 = 0xA00, rDataPtr // Pointer to "tail"
                                                   // coefficients tables
};;
{.mfi
      ldfe           fA17 = [rCoeffAddr1], 32     // Load A17
      nop.f          0
      nop.i          0
}
{.mfi
      ldfe           fA16 = [rCoeffAddr2], 32     // Load A16
      nop.f          0
      nop.i          0
};;
{.mfi
      ldfe           fA15 = [rCoeffAddr1], 32     // Load A15
      fma.s1         fTSqr = fArgAbsNorm, fArgAbsNorm, f0 // y^2, y = fArgAbsNorm
      shr.u          rIndex = rIndex, 2           // Index for "tail" tables
}
{.mfi
      ldfe           fA14 = [rCoeffAddr2], 32     // Load A14
      nop.f          0
      adds           rCoeffAddr4 = 16, r0         // Shifted pointer
                                                  // to "tail" tables
};;
{.mfi
      ldfe           fA13 = [rCoeffAddr1], 32     // Load A13
      nop.f          0
      add            rCoeffAddr3 = rCoeffAddr3, rIndex // "tail" coeffs to load
                                                       // ##16..23
}
{.mfi
      ldfe           fA12 = [rCoeffAddr2], 32     // Load A12
      nop.f          0
      cmp.lt         p15, p14 = rArg, r0          // Arg positive (p14)
                                                  // or negative (p15)?
};;
{.mfi
      ldfe           fA11 = [rCoeffAddr1], 32     // Load A11
      nop.f          0
      add            rCoeffAddr4 = rCoeffAddr3, rCoeffAddr4 // shifted "tail"
                                                            // coeffs to load
}
{.mfi
      ldfe           fA10 = [rCoeffAddr2], 32     // Load A10
      nop.f          0
      nop.i          0
};;
{.mfi
      ldfe           fA9 = [rCoeffAddr1], 32      // Load A9
      nop.f          0
      nop.i          0
}
{.mfi
      ldfe           fA8 = [rCoeffAddr2], 32      // Load A8
      nop.f          0
      nop.i          0
};;
{.mfi
      ldfe           fA7 = [rCoeffAddr1], 32      // Load A7
      nop.f          0
      nop.i          0
}
{.mfi
      ldfe           fA6 = [rCoeffAddr2], 32      // Load A6
      nop.f          0
      nop.i          0
};;
{.mfi
      ldfe           fA5 = [rCoeffAddr1], 32      // Load A5
      fma.s1         fTDeg3 = fArgAbsNorm, fTSqr, f0 // y^3
      nop.i          0
}
{.mfi
      ldfe           fA4 = [rCoeffAddr2], 32      // Load A4
      fma.s1         fTQuadr = fTSqr, fTSqr, f0   // y^4
      nop.i          0
};;
// Path #3 Polynomial Pol19(y) computation; y = fArgAbsNorm
{.mfi
      ldfe           fA3 = [rCoeffAddr3], 32      // Load A3
      fma.s1         fArgAbsNormSgn = fArgAbsNorm, fSignumX, f0 // sign(x)*y
      nop.i          0
}
{.mfi
      ldfe           fA2 = [rCoeffAddr4], 32      // Load A2
      nop.f          0
      nop.i          0
};;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -