📄 s_cos.s
字号:
//// * If Arg is +-0, +-inf, NaN, NaT, go to Step 10 for special// handling.// * If |Arg| < 2^24, go to Step 2 for reduction of moderate// arguments. This is the most likely case.// * If |Arg| < 2^63, go to Step 8 for pre-reduction of large// arguments.// * If |Arg| >= 2^63, go to Step 10 for special handling.//// Step 2. Reduction of moderate arguments.//// If |Arg| < pi/4 ...quick branch// N_fix := N_inc (integer)// r := Arg// c := 0.0// Branch to Step 4, Case_1_complete// Else ...cf. argument reduction// N := Arg * two_by_PI (fp)// N_fix := fcvt.fx( N ) (int)// N := fcvt.xf( N_fix )// N_fix := N_fix + N_inc// s := Arg - N * P_1 (first piece of pi/2)// w := -N * P_2 (second piece of pi/2)//// If |s| >= 2^(-33)// go to Step 3, Case_1_reduce// Else// go to Step 7, Case_2_reduce// Endif// Endif//// Step 3. Case_1_reduce.//// r := s + w// c := (s - r) + w ...observe order//// Step 4. Case_1_complete//// ...At this point, the reduced argument alpha is// ...accurately represented as r + c.// If |r| < 2^(-3), go to Step 6, small_r.//// Step 5. Normal_r.//// Let [i_0 i_1] by the 2 lsb of N_fix.// FR_rsq := r * r// r_hi := frcpa( frcpa( r ) )// r_lo := r - r_hi//// If i_1 = 0, then// poly := r*FR_rsq*(PP_1_lo + FR_rsq*(PP_2 + ... FR_rsq*PP_8))// U_hi := r + PP_1_hi*r_hi*r_hi*r_hi ...any order// U_lo := PP_1_hi*r_lo*(r*r + r*r_hi + r_hi*r_hi)// correction := c + c*C_1*FR_rsq ...any order// Else// poly := FR_rsq*FR_rsq*(QQ_2 + FR_rsq*(QQ_3 + ... + FR_rsq*QQ_8))// U_hi := 1 + QQ_1 * r_hi * r_hi ...any order// U_lo := QQ_1 * r_lo * (r + r_hi)// correction := -c*(r + S_1*FR_rsq*r) ...any order// Endif//// V := poly + (U_lo + correction) ...observe order//// result := (i_0 == 0? 1.0 : -1.0)//// Last instruction in user-set rounding mode//// result := (i_0 == 0? result*U_hi + V :// result*U_hi - V)//// Return//// Step 6. Small_r.//// ...Use flush to zero mode without causing exception// Let [i_0 i_1] be the two lsb of N_fix.//// FR_rsq := r * r//// If i_1 = 0 then// z := FR_rsq*FR_rsq; z := FR_rsq*z *r// poly_lo := S_3 + FR_rsq*(S_4 + FR_rsq*S_5)// poly_hi := r*FR_rsq*(S_1 + FR_rsq*S_2)// correction := c// result := r// Else// z := FR_rsq*FR_rsq; z := FR_rsq*z// poly_lo := C_3 + FR_rsq*(C_4 + FR_rsq*C_5)// poly_hi := FR_rsq*(C_1 + FR_rsq*C_2)// correction := -c*r// result := 1// Endif//// poly := poly_hi + (z * poly_lo + correction)//// If i_0 = 1, result := -result//// Last operation. Perform in user-set rounding mode//// result := (i_0 == 0? result + poly :// result - poly )// Return//// Step 7. Case_2_reduce.//// ...Refer to the write up for argument reduction for// ...rationale. The reduction algorithm below is taken from// ...argument reduction description and integrated this.//// w := N*P_3// U_1 := N*P_2 + w ...FMA// U_2 := (N*P_2 - U_1) + w ...2 FMA// ...U_1 + U_2 is N*(P_2+P_3) accurately//// r := s - U_1// c := ( (s - r) - U_1 ) - U_2//// ...The mathematical sum r + c approximates the reduced// ...argument accurately. Note that although compared to// ...Case 1, this case requires much more work to reduce// ...the argument, the subsequent calculation needed for// ...any of the trigonometric function is very little because// ...|alpha| < 1.01*2^(-33) and thus two terms of the// ...Taylor series expansion suffices.//// If i_1 = 0 then// poly := c + S_1 * r * r * r ...any order// result := r// Else// poly := -2^(-67)// result := 1.0// Endif//// If i_0 = 1, result := -result//// Last operation. Perform in user-set rounding mode//// result := (i_0 == 0? result + poly :// result - poly )//// Return////// Step 8. Pre-reduction of large arguments.//// ...Again, the following reduction procedure was described// ...in the separate write up for argument reduction, which// ...is tightly integrated here.// N_0 := Arg * Inv_P_0// N_0_fix := fcvt.fx( N_0 )// N_0 := fcvt.xf( N_0_fix)// Arg' := Arg - N_0 * P_0// w := N_0 * d_1// N := Arg' * two_by_PI// N_fix := fcvt.fx( N )// N := fcvt.xf( N_fix )// N_fix := N_fix + N_inc//// s := Arg' - N * P_1// w := w - N * P_2//// If |s| >= 2^(-14)// go to Step 3// Else// go to Step 9// Endif//// Step 9. Case_4_reduce.//// ...first obtain N_0*d_1 and -N*P_2 accurately// U_hi := N_0 * d_1 V_hi := -N*P_2// U_lo := N_0 * d_1 - U_hi V_lo := -N*P_2 - U_hi ...FMAs//// ...compute the contribution from N_0*d_1 and -N*P_3// w := -N*P_3// w := w + N_0*d_2// t := U_lo + V_lo + w ...any order//// ...at this point, the mathematical value// ...s + U_hi + V_hi + t approximates the true reduced argument// ...accurately. Just need to compute this accurately.//// ...Calculate U_hi + V_hi accurately:// A := U_hi + V_hi// if |U_hi| >= |V_hi| then// a := (U_hi - A) + V_hi// else// a := (V_hi - A) + U_hi// endif// ...order in computing "a" must be observed. This branch is// ...best implemented by predicates.// ...A + a is U_hi + V_hi accurately. Moreover, "a" is// ...much smaller than A: |a| <= (1/2)ulp(A).//// ...Just need to calculate s + A + a + t// C_hi := s + A t := t + a// C_lo := (s - C_hi) + A// C_lo := C_lo + t//// ...Final steps for reduction// r := C_hi + C_lo// c := (C_hi - r) + C_lo//// ...At this point, we have r and c// ...And all we need is a couple of terms of the corresponding// ...Taylor series.//// If i_1 = 0// poly := c + r*FR_rsq*(S_1 + FR_rsq*S_2)// result := r// Else// poly := FR_rsq*(C_1 + FR_rsq*C_2)// result := 1// Endif//// If i_0 = 1, result := -result//// Last operation. Perform in user-set rounding mode//// result := (i_0 == 0? result + poly :// result - poly )// Return//// Large Arguments: For arguments above 2**63, a Payne-Hanek// style argument reduction is used and pi_by_2 reduce is called.//#ifdef _LIBC.rodata#else.data#endif.align 64FSINCOS_CONSTANTS:ASM_TYPE_DIRECTIVE(FSINCOS_CONSTANTS,@object)data4 0x4B800000, 0xCB800000, 0x00000000,0x00000000 // two**24, -two**24data4 0x4E44152A, 0xA2F9836E, 0x00003FFE,0x00000000 // Inv_pi_by_2data4 0xCE81B9F1, 0xC84D32B0, 0x00004016,0x00000000 // P_0data4 0x2168C235, 0xC90FDAA2, 0x00003FFF,0x00000000 // P_1data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD,0x00000000 // P_2data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C,0x00000000 // P_3data4 0x5F000000, 0xDF000000, 0x00000000,0x00000000 // two_to_63, -two_to_63data4 0x6EC6B45A, 0xA397E504, 0x00003FE7,0x00000000 // Inv_P_0data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF,0x00000000 // d_1data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C,0x00000000 // d_2data4 0x2168C234, 0xC90FDAA2, 0x00003FFE,0x00000000 // pi_by_4data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE,0x00000000 // neg_pi_by_4data4 0x3E000000, 0xBE000000, 0x00000000,0x00000000 // two**-3, -two**-3data4 0x2F000000, 0xAF000000, 0x9E000000,0x00000000 // two**-33, -two**-33, -two**-67data4 0xA21C0BC9, 0xCC8ABEBC, 0x00003FCE,0x00000000 // PP_8data4 0x720221DA, 0xD7468A05, 0x0000BFD6,0x00000000 // PP_7data4 0x640AD517, 0xB092382F, 0x00003FDE,0x00000000 // PP_6data4 0xD1EB75A4, 0xD7322B47, 0x0000BFE5,0x00000000 // PP_5data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1data4 0x00000000, 0xAAAA0000, 0x0000BFFC,0x00000000 // PP_1_hidata4 0xBAF69EEA, 0xB8EF1D2A, 0x00003FEC,0x00000000 // PP_4data4 0x0D03BB69, 0xD00D00D0, 0x0000BFF2,0x00000000 // PP_3data4 0x88888962, 0x88888888, 0x00003FF8,0x00000000 // PP_2data4 0xAAAB0000, 0xAAAAAAAA, 0x0000BFEC,0x00000000 // PP_1_lodata4 0xC2B0FE52, 0xD56232EF, 0x00003FD2,0x00000000 // QQ_8data4 0x2B48DCA6, 0xC9C99ABA, 0x0000BFDA,0x00000000 // QQ_7data4 0x9C716658, 0x8F76C650, 0x00003FE2,0x00000000 // QQ_6data4 0xFDA8D0FC, 0x93F27DBA, 0x0000BFE9,0x00000000 // QQ_5data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1data4 0x00000000, 0x80000000, 0x0000BFFE,0x00000000 // QQ_1data4 0x0C6E5041, 0xD00D00D0, 0x00003FEF,0x00000000 // QQ_4data4 0x0B607F60, 0xB60B60B6, 0x0000BFF5,0x00000000 // QQ_3data4 0xAAAAAA9B, 0xAAAAAAAA, 0x00003FFA,0x00000000 // QQ_2data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1data4 0xAAAA719F, 0xAAAAAAAA, 0x00003FFA,0x00000000 // C_2data4 0x0356F994, 0xB60B60B6, 0x0000BFF5,0x00000000 // C_3data4 0xB2385EA9, 0xD00CFFD5, 0x00003FEF,0x00000000 // C_4data4 0x292A14CD, 0x93E4BD18, 0x0000BFE9,0x00000000 // C_5data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1data4 0x888868DB, 0x88888888, 0x00003FF8,0x00000000 // S_2data4 0x055EFD4B, 0xD00D00D0, 0x0000BFF2,0x00000000 // S_3data4 0x839730B9, 0xB8EF1C5D, 0x00003FEC,0x00000000 // S_4data4 0xE5B3F492, 0xD71EA3A4, 0x0000BFE5,0x00000000 // S_5data4 0x38800000, 0xB8800000, 0x00000000 // two**-14, -two**-14ASM_SIZE_DIRECTIVE(FSINCOS_CONSTANTS)FR_Input_X = f8FR_Neg_Two_to_M3 = f32FR_Two_to_63 = f32FR_Two_to_24 = f33FR_Pi_by_4 = f33FR_Two_to_M14 = f34FR_Two_to_M33 = f35FR_Neg_Two_to_24 = f36FR_Neg_Pi_by_4 = f36FR_Neg_Two_to_M14 = f37FR_Neg_Two_to_M33 = f38FR_Neg_Two_to_M67 = f39FR_Inv_pi_by_2 = f40FR_N_float = f41FR_N_fix = f42FR_P_1 = f43FR_P_2 = f44FR_P_3 = f45FR_s = f46FR_w = f47FR_c = f48FR_r = f49FR_Z = f50FR_A = f51FR_a = f52FR_t = f53FR_U_1 = f54FR_U_2 = f55FR_C_1 = f56FR_C_2 = f57FR_C_3 = f58FR_C_4 = f59FR_C_5 = f60FR_S_1 = f61FR_S_2 = f62FR_S_3 = f63FR_S_4 = f64FR_S_5 = f65FR_poly_hi = f66FR_poly_lo = f67FR_r_hi = f68FR_r_lo = f69FR_rsq = f70FR_r_cubed = f71FR_C_hi = f72FR_N_0 = f73FR_d_1 = f74FR_V = f75FR_V_hi = f75FR_V_lo = f76FR_U_hi = f77FR_U_lo = f78FR_U_hiabs = f79FR_V_hiabs = f80FR_PP_8 = f81FR_QQ_8 = f81FR_PP_7 = f82FR_QQ_7 = f82FR_PP_6 = f83FR_QQ_6 = f83FR_PP_5 = f84FR_QQ_5 = f84FR_PP_4 = f85FR_QQ_4 = f85FR_PP_3 = f86FR_QQ_3 = f86FR_PP_2 = f87FR_QQ_2 = f87FR_QQ_1 = f88FR_N_0_fix = f89FR_Inv_P_0 = f90FR_corr = f91FR_poly = f92FR_d_2 = f93FR_Two_to_M3 = f94FR_Neg_Two_to_63 = f94FR_P_0 = f95FR_C_lo = f96FR_PP_1 = f97FR_PP_1_lo = f98FR_ArgPrime = f99GR_Table_Base = r32GR_Table_Base1 = r33GR_i_0 = r34GR_i_1 = r35GR_N_Inc = r36GR_Sin_or_Cos = r37GR_SAVE_B0 = r39GR_SAVE_GP = r40GR_SAVE_PFS = r41.section .text.proc __libm_sin_double_dbx#.align 64__libm_sin_double_dbx:{ .mlxalloc GR_Table_Base = ar.pfs,0,12,2,0 movl GR_Sin_or_Cos = 0x0 ;;}{ .mmi nop.m 999 addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp nop.i 999};;{ .mmi ld8 GR_Table_Base = [GR_Table_Base] nop.m 999 nop.i 999};;{ .mib nop.m 999 nop.i 999 br.cond.sptk L(SINCOS_CONTINUE) ;;}.endp __libm_sin_double_dbx#ASM_SIZE_DIRECTIVE(__libm_sin_double_dbx).section .text.proc __libm_cos_double_dbx#__libm_cos_double_dbx:{ .mlxalloc GR_Table_Base= ar.pfs,0,12,2,0 movl GR_Sin_or_Cos = 0x1 ;;}{ .mmi nop.m 999 addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp nop.i 999};;{ .mmi ld8 GR_Table_Base = [GR_Table_Base] nop.m 999 nop.i 999};;//// Load Table Address//L(SINCOS_CONTINUE):{ .mmi add GR_Table_Base1 = 96, GR_Table_Base ldfs FR_Two_to_24 = [GR_Table_Base], 4 nop.i 999};;{ .mmi nop.m 999//// Load 2**24, load 2**63.// ldfs FR_Neg_Two_to_24 = [GR_Table_Base], 12 mov r41 = ar.pfs ;;}{ .mfi ldfs FR_Two_to_63 = [GR_Table_Base1], 4//// Check for unnormals - unsupported operands. We do not want// to generate denormal exception// Check for NatVals, QNaNs, SNaNs, +/-Infs// Check for EM unsupporteds// Check for Zero// fclass.m.unc p6, p8 = FR_Input_X, 0x1E3 mov r40 = gp ;;}{ .mfi nop.m 999 fclass.nm.unc p8, p0 = FR_Input_X, 0x1FF// GR_Sin_or_Cos denotes mov r39 = b0}{ .mfb ldfs FR_Neg_Two_to_63 = [GR_Table_Base1], 12 fclass.m.unc p10, p0 = FR_Input_X, 0x007(p6) br.cond.spnt L(SINCOS_SPECIAL) ;;}{ .mib nop.m 999 nop.i 999(p8) br.cond.spnt L(SINCOS_SPECIAL) ;;}{ .mib nop.m 999 nop.i 999//// Branch if +/- NaN, Inf.// Load -2**24, load -2**63.//(p10) br.cond.spnt L(SINCOS_ZERO) ;;}{ .mmb ldfe FR_Inv_pi_by_2 = [GR_Table_Base], 16 ldfe FR_Inv_P_0 = [GR_Table_Base1], 16 nop.b 999 ;;}{ .mmb nop.m 999 ldfe FR_d_1 = [GR_Table_Base1], 16 nop.b 999 ;;}//// Raise possible denormal operand flag with useful fcmp// Is x <= -2**63// Load Inv_P_0 for pre-reduction// Load Inv_pi_by_2//{ .mmb ldfe FR_P_0 = [GR_Table_Base], 16 ldfe FR_d_2 = [GR_Table_Base1], 16 nop.b 999 ;;}//// Load P_0// Load d_1// Is x >= 2**63// Is x <= -2**24?//{ .mmi ldfe FR_P_1 = [GR_Table_Base], 16 ;;//// Load P_1// Load d_2// Is x >= 2**24?// ldfe FR_P_2 = [GR_Table_Base], 16 nop.i 999 ;;}{ .mmf nop.m 999 ldfe FR_P_3 = [GR_Table_Base], 16 fcmp.le.unc.s1 p7, p8 = FR_Input_X, FR_Neg_Two_to_24}{ .mfi nop.m 999//// Branch if +/- zero.// Decide about the paths to take:// If -2**24 < FR_Input_X < 2**24 - CASE 1 OR 2// OTHERWISE - CASE 3 OR 4// fcmp.le.unc.s0 p10, p11 = FR_Input_X, FR_Neg_Two_to_63 nop.i 999 ;;}{ .mfi nop.m 999(p8) fcmp.ge.s1 p7, p0 = FR_Input_X, FR_Two_to_24 nop.i 999}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -