📄 libm_sincosl.s
字号:
// A := U_hi + V_hi// if |U_hi| >= |V_hi| then// a := (U_hi - A) + V_hi// else// a := (V_hi - A) + U_hi// endif// ...order in computing "a" must be observed. This branch is// ...best implemented by predicates.// ...A + a is U_hi + V_hi accurately. Moreover, "a" is// ...much smaller than A: |a| <= (1/2)ulp(A).//// ...Just need to calculate s + A + a + t// C_hi := s + A t := t + a// C_lo := (s - C_hi) + A// C_lo := C_lo + t//// ...Final steps for reduction// r := C_hi + C_lo// c := (C_hi - r) + C_lo//// ...At this point, we have r and c// ...And all we need is a couple of terms of the corresponding// ...Taylor series.//// If i_1 = 0// poly := c + r*FR_rsq*(S_1 + FR_rsq*S_2)// result := r// Else// poly := FR_rsq*(C_1 + FR_rsq*C_2)// result := 1// Endif//// If i_0 = 1, result := -result//// Last operation. Perform in user-set rounding mode//// result := (i_0 == 0? result + poly :// result - poly )// Return//// Large Arguments: For arguments above 2**63, a Payne-Hanek// style argument reduction is used and pi_by_2 reduce is called.//RODATA.align 64LOCAL_OBJECT_START(FSINCOSL_CONSTANTS)sincosl_table_p://data4 0x4E44152A, 0xA2F9836E, 0x00003FFE,0x00000000 // Inv_pi_by_2//data4 0xCE81B9F1, 0xC84D32B0, 0x00004016,0x00000000 // P_0//data4 0x2168C235, 0xC90FDAA2, 0x00003FFF,0x00000000 // P_1//data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD,0x00000000 // P_2//data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C,0x00000000 // P_3//data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF,0x00000000 // d_1//data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C,0x00000000 // d_2data8 0xA2F9836E4E44152A, 0x00003FFE // Inv_pi_by_2data8 0xC84D32B0CE81B9F1, 0x00004016 // P_0data8 0xC90FDAA22168C235, 0x00003FFF // P_1data8 0xECE675D1FC8F8CBB, 0x0000BFBD // P_2data8 0xB7ED8FBBACC19C60, 0x0000BF7C // P_3data8 0x8D848E89DBD171A1, 0x0000BFBF // d_1data8 0xD5394C3618A66F8E, 0x0000BF7C // d_2LOCAL_OBJECT_END(FSINCOSL_CONSTANTS)LOCAL_OBJECT_START(sincosl_table_d)//data4 0x2168C234, 0xC90FDAA2, 0x00003FFE,0x00000000 // pi_by_4//data4 0x6EC6B45A, 0xA397E504, 0x00003FE7,0x00000000 // Inv_P_0data8 0xC90FDAA22168C234, 0x00003FFE // pi_by_4data8 0xA397E5046EC6B45A, 0x00003FE7 // Inv_P_0data4 0x3E000000, 0xBE000000 // 2^-3 and -2^-3data4 0x2F000000, 0xAF000000 // 2^-33 and -2^-33data4 0x9E000000, 0x00000000 // -2^-67data4 0x00000000, 0x00000000 // padLOCAL_OBJECT_END(sincosl_table_d)LOCAL_OBJECT_START(sincosl_table_pp)//data4 0xA21C0BC9, 0xCC8ABEBC, 0x00003FCE,0x00000000 // PP_8//data4 0x720221DA, 0xD7468A05, 0x0000BFD6,0x00000000 // PP_7//data4 0x640AD517, 0xB092382F, 0x00003FDE,0x00000000 // PP_6//data4 0xD1EB75A4, 0xD7322B47, 0x0000BFE5,0x00000000 // PP_5//data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1//data4 0x00000000, 0xAAAA0000, 0x0000BFFC,0x00000000 // PP_1_hi//data4 0xBAF69EEA, 0xB8EF1D2A, 0x00003FEC,0x00000000 // PP_4//data4 0x0D03BB69, 0xD00D00D0, 0x0000BFF2,0x00000000 // PP_3//data4 0x88888962, 0x88888888, 0x00003FF8,0x00000000 // PP_2//data4 0xAAAB0000, 0xAAAAAAAA, 0x0000BFEC,0x00000000 // PP_1_lodata8 0xCC8ABEBCA21C0BC9, 0x00003FCE // PP_8data8 0xD7468A05720221DA, 0x0000BFD6 // PP_7data8 0xB092382F640AD517, 0x00003FDE // PP_6data8 0xD7322B47D1EB75A4, 0x0000BFE5 // PP_5data8 0xFFFFFFFFFFFFFFFE, 0x0000BFFD // C_1data8 0xAAAA000000000000, 0x0000BFFC // PP_1_hidata8 0xB8EF1D2ABAF69EEA, 0x00003FEC // PP_4data8 0xD00D00D00D03BB69, 0x0000BFF2 // PP_3data8 0x8888888888888962, 0x00003FF8 // PP_2data8 0xAAAAAAAAAAAB0000, 0x0000BFEC // PP_1_loLOCAL_OBJECT_END(sincosl_table_pp)LOCAL_OBJECT_START(sincosl_table_qq)//data4 0xC2B0FE52, 0xD56232EF, 0x00003FD2 // QQ_8//data4 0x2B48DCA6, 0xC9C99ABA, 0x0000BFDA // QQ_7//data4 0x9C716658, 0x8F76C650, 0x00003FE2 // QQ_6//data4 0xFDA8D0FC, 0x93F27DBA, 0x0000BFE9 // QQ_5//data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC // S_1//data4 0x00000000, 0x80000000, 0x0000BFFE,0x00000000 // QQ_1//data4 0x0C6E5041, 0xD00D00D0, 0x00003FEF,0x00000000 // QQ_4//data4 0x0B607F60, 0xB60B60B6, 0x0000BFF5,0x00000000 // QQ_3//data4 0xAAAAAA9B, 0xAAAAAAAA, 0x00003FFA,0x00000000 // QQ_2data8 0xD56232EFC2B0FE52, 0x00003FD2 // QQ_8data8 0xC9C99ABA2B48DCA6, 0x0000BFDA // QQ_7data8 0x8F76C6509C716658, 0x00003FE2 // QQ_6data8 0x93F27DBAFDA8D0FC, 0x0000BFE9 // QQ_5data8 0xAAAAAAAAAAAAAAAA, 0x0000BFFC // S_1data8 0x8000000000000000, 0x0000BFFE // QQ_1data8 0xD00D00D00C6E5041, 0x00003FEF // QQ_4data8 0xB60B60B60B607F60, 0x0000BFF5 // QQ_3data8 0xAAAAAAAAAAAAAA9B, 0x00003FFA // QQ_2LOCAL_OBJECT_END(sincosl_table_qq)LOCAL_OBJECT_START(sincosl_table_c)//data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1//data4 0xAAAA719F, 0xAAAAAAAA, 0x00003FFA,0x00000000 // C_2//data4 0x0356F994, 0xB60B60B6, 0x0000BFF5,0x00000000 // C_3//data4 0xB2385EA9, 0xD00CFFD5, 0x00003FEF,0x00000000 // C_4//data4 0x292A14CD, 0x93E4BD18, 0x0000BFE9,0x00000000 // C_5data8 0xFFFFFFFFFFFFFFFE, 0x0000BFFD // C_1data8 0xAAAAAAAAAAAA719F, 0x00003FFA // C_2data8 0xB60B60B60356F994, 0x0000BFF5 // C_3data8 0xD00CFFD5B2385EA9, 0x00003FEF // C_4data8 0x93E4BD18292A14CD, 0x0000BFE9 // C_5LOCAL_OBJECT_END(sincosl_table_c)LOCAL_OBJECT_START(sincosl_table_s)//data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1//data4 0x888868DB, 0x88888888, 0x00003FF8,0x00000000 // S_2//data4 0x055EFD4B, 0xD00D00D0, 0x0000BFF2,0x00000000 // S_3//data4 0x839730B9, 0xB8EF1C5D, 0x00003FEC,0x00000000 // S_4//data4 0xE5B3F492, 0xD71EA3A4, 0x0000BFE5,0x00000000 // S_5data8 0xAAAAAAAAAAAAAAAA, 0x0000BFFC // S_1data8 0x88888888888868DB, 0x00003FF8 // S_2data8 0xD00D00D0055EFD4B, 0x0000BFF2 // S_3data8 0xB8EF1C5D839730B9, 0x00003FEC // S_4data8 0xD71EA3A4E5B3F492, 0x0000BFE5 // S_5data4 0x38800000, 0xB8800000 // two**-14 and -two**-14LOCAL_OBJECT_END(sincosl_table_s)FR_Input_X = f8FR_Result = f8FR_ResultS = f9FR_ResultC = f8FR_r = f8FR_c = f9FR_norm_x = f9FR_inv_pi_2to63 = f10FR_rshf_2to64 = f11FR_2tom64 = f12FR_rshf = f13FR_N_float_signif = f14FR_abs_x = f15FR_r6 = f32FR_r7 = f33FR_Pi_by_4 = f34FR_Two_to_M14 = f35FR_Neg_Two_to_M14 = f36FR_Two_to_M33 = f37FR_Neg_Two_to_M33 = f38FR_Neg_Two_to_M67 = f39FR_Inv_pi_by_2 = f40FR_N_float = f41FR_N_fix = f42FR_P_1 = f43FR_P_2 = f44FR_P_3 = f45FR_s = f46FR_w = f47FR_Z = f50FR_A = f51FR_a = f52FR_t = f53FR_U_1 = f54FR_U_2 = f55FR_C_1 = f56FR_C_2 = f57FR_C_3 = f58FR_C_4 = f59FR_C_5 = f60FR_S_1 = f61FR_S_2 = f62FR_S_3 = f63FR_S_4 = f64FR_S_5 = f65FR_r_hi = f68FR_r_lo = f69FR_rsq = f70FR_r_cubed = f71FR_C_hi = f72FR_N_0 = f73FR_d_1 = f74FR_V_hi = f75FR_V_lo = f76FR_U_hi = f77FR_U_lo = f78FR_U_hiabs = f79FR_V_hiabs = f80FR_PP_8 = f81FR_QQ_8 = f101FR_PP_7 = f82FR_QQ_7 = f102FR_PP_6 = f83FR_QQ_6 = f103FR_PP_5 = f84FR_QQ_5 = f104FR_PP_4 = f85FR_QQ_4 = f105FR_PP_3 = f86FR_QQ_3 = f106FR_PP_2 = f87FR_QQ_2 = f107FR_QQ_1 = f108FR_r_hi_sq = f88FR_N_0_fix = f89FR_Inv_P_0 = f90FR_d_2 = f93FR_P_0 = f95FR_C_lo = f96FR_PP_1 = f97FR_PP_1_lo = f98FR_ArgPrime = f99FR_inexact = f100FR_Neg_Two_to_M3 = f109FR_Two_to_M3 = f110FR_poly_hiS = f66FR_poly_hiC = f112FR_poly_loS = f67FR_poly_loC = f113FR_polyS = f92FR_polyC = f114FR_cS = FR_cFR_cC = f115FR_corrS = f91FR_corrC = f116FR_U_hiC = f117FR_U_loC = f118FR_VS = f75FR_VC = f119FR_FirstS = f120FR_FirstC = f121FR_U_hiS = FR_U_hiFR_U_loS = FR_U_loFR_Tmp = f94sincos_pResSin = r34sincos_pResCos = r35GR_exp_m2_to_m3= r36GR_N_Inc = r37GR_Cis = r38GR_signexp_x = r40GR_exp_x = r40GR_exp_mask = r41GR_exp_2_to_63 = r42GR_exp_2_to_m3 = r43GR_exp_2_to_24 = r44GR_N_SignS = r45GR_N_SignC = r46GR_N_SinCos = r47GR_sig_inv_pi = r48GR_rshf_2to64 = r49GR_exp_2tom64 = r50GR_rshf = r51GR_ad_p = r52GR_ad_d = r53GR_ad_pp = r54GR_ad_qq = r55GR_ad_c = r56GR_ad_s = r57GR_ad_ce = r58GR_ad_se = r59GR_ad_m14 = r60GR_ad_s1 = r61// For unwind supportGR_SAVE_B0 = r39GR_SAVE_GP = r40GR_SAVE_PFS = r41.section .textGLOBAL_IEEE754_ENTRY(sincosl){ .mlx ///////////////////////////// 1 ///////////////// alloc r32 = ar.pfs,3,27,2,0 movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi}{ .mlx mov GR_N_Inc = 0x0 movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64)};;{ .mfi ///////////////////////////// 2 ///////////////// addl GR_ad_p = @ltoff(FSINCOSL_CONSTANTS#), gp fclass.m p6, p0 = FR_Input_X, 0x1E3 // Test x natval, nan, inf mov GR_exp_2_to_m3 = 0xffff - 3 // Exponent of 2^-3}{ .mfb mov GR_Cis = 0x0 fnorm.s1 FR_norm_x = FR_Input_X // Normalize x br.cond.sptk _COMMON_SINCOSL};;GLOBAL_IEEE754_END(sincosl)GLOBAL_LIBM_ENTRY(__libm_sincosl){ .mlx ///////////////////////////// 1 ///////////////// alloc r32 = ar.pfs,3,27,2,0 movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi}{ .mlx mov GR_N_Inc = 0x0 movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64)};;{ .mfi ///////////////////////////// 2 ///////////////// addl GR_ad_p = @ltoff(FSINCOSL_CONSTANTS#), gp fclass.m p6, p0 = FR_Input_X, 0x1E3 // Test x natval, nan, inf mov GR_exp_2_to_m3 = 0xffff - 3 // Exponent of 2^-3}{ .mfb mov GR_Cis = 0x1 fnorm.s1 FR_norm_x = FR_Input_X // Normalize x nop.b 0};;_COMMON_SINCOSL:{ .mfi ///////////////////////////// 3 ///////////////// setf.sig FR_inv_pi_2to63 = GR_sig_inv_pi // Form 1/pi * 2^63 nop.f 0 mov GR_exp_2tom64 = 0xffff - 64 // Scaling constant to compute N}{ .mlx setf.d FR_rshf_2to64 = GR_rshf_2to64 // Form const 1.1000 * 2^(63+64) movl GR_rshf = 0x43e8000000000000 // Form const 1.1000 * 2^63};;{ .mfi ///////////////////////////// 4 ///////////////// ld8 GR_ad_p = [GR_ad_p] // Point to Inv_pi_by_2 fclass.m p7, p0 = FR_Input_X, 0x0b // Test x denormal nop.i 0};;{ .mfi ///////////////////////////// 5 ///////////////// getf.exp GR_signexp_x = FR_Input_X // Get sign and exponent of x fclass.m p10, p0 = FR_Input_X, 0x007 // Test x zero nop.i 0}{ .mib mov GR_exp_mask = 0x1ffff // Exponent mask nop.i 0(p6) br.cond.spnt SINCOSL_SPECIAL // Branch if x natval, nan, inf};;{ .mfi ///////////////////////////// 6 ///////////////// setf.exp FR_2tom64 = GR_exp_2tom64 // Form 2^-64 for scaling N_float nop.f 0 add GR_ad_d = 0x70, GR_ad_p // Point to constant table d}{ .mib setf.d FR_rshf = GR_rshf // Form right shift const 1.1000 * 2^63 mov GR_exp_m2_to_m3 = 0x2fffc // Form -(2^-3)(p7) br.cond.spnt SINCOSL_DENORMAL // Branch if x denormal};;SINCOSL_COMMON2:{ .mfi ///////////////////////////// 7 ///////////////// and GR_exp_x = GR_exp_mask, GR_signexp_x // Get exponent of x fclass.nm p8, p0 = FR_Input_X, 0x1FF // Test x unsupported type mov GR_exp_2_to_63 = 0xffff + 63 // Exponent of 2^63}{ .mib add GR_ad_pp = 0x40, GR_ad_d // Point to constant table pp mov GR_exp_2_to_24 = 0xffff + 24 // Exponent of 2^24(p10) br.cond.spnt SINCOSL_ZERO // Branch if x zero};;{ .mfi ///////////////////////////// 8 ///////////////// ldfe FR_Inv_pi_by_2 = [GR_ad_p], 16 // Load 2/pi fcmp.eq.s0 p15, p0 = FR_Input_X, f0 // Dummy to set denormal add GR_ad_qq = 0xa0, GR_ad_pp // Point to constant table qq}{ .mfi ldfe FR_Pi_by_4 = [GR_ad_d], 16 // Load pi/4 for range test nop.f 0 cmp.ge p10,p0 = GR_exp_x, GR_exp_2_to_63 // Is |x| >= 2^63};;{ .mfi ///////////////////////////// 9 ///////////////// ldfe FR_P_0 = [GR_ad_p], 16 // Load P_0 for pi/4 <= |x| < 2^63 fmerge.s FR_abs_x = f1, FR_norm_x // |x| add GR_ad_c = 0x90, GR_ad_qq // Point to constant table c}{ .mfi ldfe FR_Inv_P_0 = [GR_ad_d], 16 // Load 1/P_0 for pi/4 <= |x| < 2^63 nop.f 0 cmp.ge p7,p0 = GR_exp_x, GR_exp_2_to_24 // Is |x| >= 2^24};;{ .mfi ///////////////////////////// 10 ///////////////// ldfe FR_P_1 = [GR_ad_p], 16 // Load P_1 for pi/4 <= |x| < 2^63 nop.f 0 add GR_ad_s = 0x50, GR_ad_c // Point to constant table s}{ .mfi ldfe FR_PP_8 = [GR_ad_pp], 16 // Load PP_8 for 2^-3 < |r| < pi/4 nop.f 0 nop.i 0};;{ .mfi ///////////////////////////// 11 ///////////////// ldfe FR_P_2 = [GR_ad_p], 16 // Load P_2 for pi/4 <= |x| < 2^63 nop.f 0 add GR_ad_ce = 0x40, GR_ad_c // Point to end of constant table c}{ .mfi ldfe FR_QQ_8 = [GR_ad_qq], 16 // Load QQ_8 for 2^-3 < |r| < pi/4 nop.f 0 nop.i 0};;{ .mfi ///////////////////////////// 12 ///////////////// ldfe FR_QQ_7 = [GR_ad_qq], 16 // Load QQ_7 for 2^-3 < |r| < pi/4 fma.s1 FR_N_float_signif = FR_Input_X, FR_inv_pi_2to63, FR_rshf_2to64 add GR_ad_se = 0x40, GR_ad_s // Point to end of constant table s}{ .mib ldfe FR_PP_7 = [GR_ad_pp], 16 // Load PP_7 for 2^-3 < |r| < pi/4 mov GR_ad_s1 = GR_ad_s // Save pointer to S_1(p10) br.cond.spnt SINCOSL_ARG_TOO_LARGE // Branch if |x| >= 2^63 // Use Payne-Hanek Reduction};;{ .mfi ///////////////////////////// 13 ///////////////// ldfe FR_P_3 = [GR_ad_p], 16 // Load P_3 for pi/4 <= |x| < 2^63 fmerge.se FR_r = FR_norm_x, FR_norm_x // r = x, in case |x| < pi/4 add GR_ad_m14 = 0x50, GR_ad_s // Point to constant table m14}{ .mfb ldfps FR_Two_to_M3, FR_Neg_Two_to_M3 = [GR_ad_d], 8 fma.s1 FR_rsq = FR_norm_x, FR_norm_x, f0 // rsq = x*x, in case |x| < pi/4(p7) br.cond.spnt SINCOSL_LARGER_ARG // Branch if 2^24 <= |x| < 2^63 // Use pre-reduction};;{ .mmf ///////////////////////////// 14 ///////////////// ldfe FR_PP_6 = [GR_ad_pp], 16 // Load PP_6 for normal path ldfe FR_QQ_6 = [GR_ad_qq], 16 // Load QQ_6 for normal path fmerge.se FR_c = f0, f0 // c = 0 in case |x| < pi/4};;{ .mmf ///////////////////////////// 15 ///////////////// ldfe FR_PP_5 = [GR_ad_pp], 16 // Load PP_5 for normal path ldfe FR_QQ_5 = [GR_ad_qq], 16 // Load QQ_5 for normal path nop.f 0};;// Here if 0 < |x| < 2^24{ .mfi ///////////////////////////// 17 ///////////////// ldfe FR_S_5 = [GR_ad_se], -16 // Load S_5 if i_1=0 fcmp.lt.s1 p6, p7 = FR_abs_x, FR_Pi_by_4 // Test |x| < pi/4 nop.i 0}{ .mfi ldfe FR_C_5 = [GR_ad_ce], -16 // Load C_5 if i_1=1 fms.s1 FR_N_float = FR_N_float_signif, FR_2tom64, FR_rshf nop.i 0};;{ .mmi ///////////////////////////// 18 ///////////////// ldfe FR_S_4 = [GR_ad_se], -16 // Load S_4 if i_1=0 ldfe FR_C_4 = [GR_ad_ce], -16 // Load C_4 if i_1=1 nop.i 0};;//// N = Arg * 2/pi// Check if Arg < pi/4//
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -