📄 s_atanl.s
字号:
ASM_SIZE_DIRECTIVE(atanl).text.proc atan2l#.global atan2l##ifdef _LIBC.proc __atan2l#.global __atan2l#.proc __ieee754_atan2l#.global __ieee754_atan2l##endif.align 64 atan2l:#ifdef _LIBC__atan2l:__ieee754_atan2l:#endif{ .mfialloc r32 = ar.pfs, 0, 17 , 4, 0(p0) mov ArgY = ArgY_orig}{ .mfi nop.m 999(p0) mov ArgX = ArgX_orig nop.i 999};;{ .mfi nop.m 999(p0) fclass.m.unc p7,p0 = ArgY_orig, 0x103 nop.i 999 }{ .mfi nop.m 999////// Save original input args and load table ptr.//(p0) fclass.m.unc p6,p0 = ArgX_orig, 0x103 nop.i 999};;{ .mfi(p0) addl table_ptr1 = @ltoff(Constants_atan#), gp(p0) fclass.m.unc p0,p9 = ArgY_orig, 0x1FF nop.i 999 ;;}{ .mfi ld8 table_ptr1 = [table_ptr1](p0) fclass.m.unc p0,p8 = ArgX_orig, 0x1FF nop.i 999}{ .mfi nop.m 999(p0) fclass.m.unc p13,p0 = ArgY_orig, 0x0C3 nop.i 999 ;;}{ .mfi(p0) fclass.m.unc p12,p0 = ArgX_orig, 0x0C3 nop.i 999}//// Check for NatVals.// Check for everything - if false, then must be pseudo-zero// or pseudo-nan (IA unsupporteds).//{ .mib nop.m 999 nop.i 999(p6) br.cond.spnt L(ATANL_NATVAL) ;;}{ .mib nop.m 999 nop.i 999(p7) br.cond.spnt L(ATANL_NATVAL) ;;}{ .mib(p0) ldfd P_hi = [table_ptr1],8 nop.i 999(p8) br.cond.spnt L(ATANL_UNSUPPORTED) ;;}{ .mbb(p0) add table_ptr2 = 96, table_ptr1(p9) br.cond.spnt L(ATANL_UNSUPPORTED)//// Load double precision high-order part of pi//(p12) br.cond.spnt L(ATANL_NAN) ;;}{ .mfb nop.m 999(p0) fnorm.s1 ArgX = ArgX(p13) br.cond.spnt L(ATANL_NAN) ;;}//// Normalize the input argument.// Branch out if NaN inputs//{ .mmf(p0) ldfs P_lo = [table_ptr1], 4 nop.m 999(p0) fnorm.s1 ArgY = ArgY ;;}{ .mmf nop.m 999(p0) ldfs TWO_TO_NEG3 = [table_ptr1], 180//// U = max(ArgX_abs,ArgY_abs)// V = min(ArgX_abs,ArgY_abs)// if PR1, swap = 0// if PR2, swap = 1//(p0) mov M = f1 ;;}{ .mfi nop.m 999//// Get exp and sign of ArgX// Get exp and sign of ArgY// Load 2**(-3) and increment ptr to Q_4.//(p0) fmerge.s ArgX_abs = f1, ArgX nop.i 999 ;;}//// load single precision low-order part of pi = P_lo//{ .mfi(p0) getf.exp sign_X = ArgX(p0) fmerge.s ArgY_abs = f1, ArgY nop.i 999 ;;}{ .mii(p0) getf.exp sign_Y = ArgY nop.i 999 ;;(p0) shr sign_X = sign_X, 17 ;;}{ .mii nop.m 999(p0) shr sign_Y = sign_Y, 17 ;;(p0) cmp.eq.unc p8, p9 = 0x00000, sign_Y ;;}{ .mfi nop.m 999//// Is ArgX_abs >= ArgY_abs// Is sign_Y == 0?//(p0) fmax.s1 U = ArgX_abs, ArgY_abs nop.i 999}{ .mfi nop.m 999//// ArgX_abs = |ArgX|// ArgY_abs = |ArgY|// sign_X is sign bit of ArgX// sign_Y is sign bit of ArgY//(p0) fcmp.ge.s1 p6, p7 = ArgX_abs, ArgY_abs nop.i 999 ;;}{ .mfi nop.m 999(p0) fmin.s1 V = ArgX_abs, ArgY_abs nop.i 999 ;;}{ .mfi nop.m 999(p8) fadd.s1 s_Y = f0, f1(p6) cmp.eq.unc p10, p11 = 0x00000, sign_X}{ .mii(p6) add swap = r0, r0 nop.i 999 ;;(p7) add swap = 1, r0}{ .mfi nop.m 999//// Let M = 1.0// if p8, s_Y = 1.0// if p9, s_Y = -1.0//(p10) fsub.s1 M = M, f1 nop.i 999 ;;}{ .mfi nop.m 999(p9) fsub.s1 s_Y = f0, f1 nop.i 999 ;;}{ .mfi nop.m 999(p0) frcpa.s1 E, p6 = V, U nop.i 999 ;;}{ .mbb nop.m 999//// E = frcpa(V,U)//(p6) br.cond.sptk L(ATANL_STEP2)(p0) br.cond.spnt L(ATANL_SPECIAL_HANDLING) ;;}L(ATANL_STEP2): { .mfi nop.m 999(p0) fmpy.s1 Q = E, V nop.i 999}{ .mfi nop.m 999(p0) fcmp.eq.s0 p0, p9 = f1, ArgY_orig nop.i 999 ;;}{ .mfi nop.m 999//// Is Q < 2**(-3)?//(p0) fcmp.eq.s0 p0, p8 = f1, ArgX_orig nop.i 999}{ .mfi nop.m 999(p11) fadd.s1 M = M, f1 nop.i 999 ;;}{ .mlx nop.m 999// *************************************************// ********************* STEP2 *********************// *************************************************(p0) movl special = 0x8400000000000000}{ .mlx nop.m 999//// lookup = b_1 b_2 b_3 B_4//(p0) movl special1 = 0x0000000000000100 ;;}{ .mfi nop.m 999//// Do fnorms to raise any denormal operand// exceptions.//(p0) fmpy.s1 P_hi = M, P_hi nop.i 999}{ .mfi nop.m 999(p0) fmpy.s1 P_lo = M, P_lo nop.i 999 ;;}{ .mfi nop.m 999//// Q = E * V//(p0) fcmp.lt.unc.s1 p6, p7 = Q, TWO_TO_NEG3 nop.i 999 ;;}{ .mmb(p0) getf.sig significand_Q = Q(p0) getf.exp exponent_Q = Q nop.b 999 ;;}{ .mmi nop.m 999 ;;(p0) andcm k = 0x0003, exponent_Q(p0) extr.u lookup = significand_Q, 59, 4 ;;}{ .mib nop.m 999(p0) dep special = lookup, special, 59, 4//// Generate 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0//(p6) br.cond.spnt L(ATANL_POLY) ;;}{ .mfi(p0) cmp.eq.unc p8, p9 = 0x0000, k(p0) fmpy.s1 P_hi = s_Y, P_hi//// We waited a few extra cycles so P_lo and P_hi could be calculated.// Load the constant 256 for loading up table entries.//// *************************************************// ******************** STEP3 **********************// *************************************************(p0) add table_ptr2 = 16, table_ptr1}//// Let z_hi have exponent and sign of original Q// Load the Tbl_hi(0) else, increment pointer.//{ .mii(p0) ldfe Q_4 = [table_ptr1], -16(p0) xor swap = sign_X, swap ;;(p9) sub k = k, r0, 1}{ .mmi(p0) setf.sig z_hi = special(p0) ldfe Q_3 = [table_ptr1], -16(p9) add table_ptr2 = 16, table_ptr2 ;;}//// U_hold = U - U_prime_hi// k = k * 256 - Result can be 0, 256, or 512.//{ .mmb(p0) ldfe Q_2 = [table_ptr1], -16(p8) ldfd Tbl_hi = [table_ptr2], 8 nop.b 999 ;;}//// U_prime_lo = U_hold + V * z_hi// lookup -> lookup * 16 + k//{ .mmi(p0) ldfe Q_1 = [table_ptr1], -16 ;;(p8) ldfs Tbl_lo = [table_ptr2], 8//// U_prime_hi = U + V * z_hi// Load the Tbl_lo(0)//(p9) pmpy2.r k = k, special1 ;;}{ .mii nop.m 999 nop.i 999 nop.i 999 ;;}{ .mii nop.m 999 nop.i 999 nop.i 999 ;;}{ .mii nop.m 999 nop.i 999 nop.i 999 ;;}{ .mii nop.m 999 nop.i 999 ;;(p9) shladd lookup = lookup, 0x0004, k ;;}{ .mmi(p9) add table_ptr2 = table_ptr2, lookup ;;//// V_prime = V - U * z_hi//(p9) ldfd Tbl_hi = [table_ptr2], 8 nop.i 999 ;;}{ .mmf nop.m 999//// C_hi = frcpa(1,U_prime_hi)//(p9) ldfs Tbl_lo = [table_ptr2], 8//// z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0// Point to beginning of Tbl_hi entries - k = 0.//(p0) fmerge.se z_hi = Q, z_hi ;;}{ .mfi nop.m 999(p0) fma.s1 U_prime_hi = V, z_hi, U nop.i 999}{ .mfi nop.m 999(p0) fnma.s1 V_prime = U, z_hi, V nop.i 999 ;;}{ .mfi nop.m 999(p0) mov A_hi = Tbl_hi nop.i 999 ;;}{ .mfi nop.m 999(p0) fsub.s1 U_hold = U, U_prime_hi nop.i 999 ;;}{ .mfi nop.m 999(p0) frcpa.s1 C_hi, p6 = f1, U_prime_hi nop.i 999 ;;}{ .mfi(p0) cmp.eq.unc p7, p6 = 0x00000, swap(p0) fmpy.s1 A_hi = s_Y, A_hi nop.i 999 ;;}{ .mfi nop.m 999//// poly = wsq * poly//(p7) fadd.s1 sigma = f0, f1 nop.i 999 ;;}{ .mfi nop.m 999(p0) fma.s1 U_prime_lo = z_hi, V, U_hold nop.i 999}{ .mfi nop.m 999(p6) fsub.s1 sigma = f0, f1 nop.i 999 ;;}{ .mfi nop.m 999(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 nop.i 999 ;;}{ .mfi nop.m 999//// A_lo = A_lo + w_hi// A_hi = s_Y * A_hi//(p0) fma.s1 Res_hi = sigma, A_hi, P_hi nop.i 999 ;;}{ .mfi nop.m 999//// C_hi_hold = 1 - C_hi * U_prime_hi (1)//(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi nop.i 999 ;;}{ .mfi nop.m 999//// C_hi = C_hi + C_hi * C_hi_hold (1)//(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 nop.i 999 ;;}{ .mfi nop.m 999//// C_hi_hold = 1 - C_hi * U_prime_hi (2)//(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi nop.i 999 ;;}{ .mfi nop.m 999//// C_hi = C_hi + C_hi * C_hi_hold (2)//(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 nop.i 999 ;;}{ .mfi nop.m 999//// C_hi_hold = 1 - C_hi * U_prime_hi (3)//(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi nop.i 999 ;;}{ .mfi nop.m 999//// C_hi = C_hi + C_hi * C_hi_hold (3)//(p0) fmpy.s1 w_hi = V_prime, C_hi nop.i 999 ;;}{ .mfi nop.m 999//// w_hi = V_prime * C_hi//(p0) fmpy.s1 wsq = w_hi, w_hi nop.i 999}{ .mfi nop.m 999(p0) fnma.s1 w_lo = w_hi, U_prime_hi, V_prime nop.i 999 ;;}{ .mfi nop.m 999//// wsq = w_hi * w_hi// w_lo = = V_prime - w_hi * U_prime_hi//(p0) fma.s1 poly = wsq, Q_4, Q_3 nop.i 999}{ .mfi nop.m 999(p0) fnma.s1 w_lo = w_hi, U_prime_lo, w_lo nop.i 999 ;;}{ .mfi nop.m 999//// poly = Q_3 + wsq * Q_4// w_lo = = w_lo - w_hi * U_prime_lo//(p0) fma.s1 poly = wsq, poly, Q_2 nop.i 999}{ .mfi nop.m 999(p0) fmpy.s1 w_lo = C_hi, w_lo nop.i 999 ;;}{ .mfi nop.m 999//// poly = Q_2 + wsq * poly// w_lo = = w_lo * C_hi//(p0) fma.s1 poly = wsq, poly, Q_1 nop.i 999}{ .mfi nop.m 999(p0) fadd.s1 A_lo = Tbl_lo, w_lo nop.i 999 ;;}{ .mfi nop.m 999//// Result = Res_hi + Res_lo * s_Y (User Supplied Rounding Mode)//(p0) fmpy.s0 Q_1 = Q_1, Q_1 nop.i 999 ;;}{ .mfi nop.m 999//// poly = Q_1 + wsq * poly// A_lo = Tbl_lo + w_lo// swap = xor(swap,sign_X)//(p0) fmpy.s1 poly = wsq, poly nop.i 999 ;;}{ .mfi nop.m 999//// Is (swap) != 0 ?// poly = wsq * poly// A_hi = Tbl_hi//(p0) fmpy.s1 poly = w_hi, poly nop.i 999 ;;}{ .mfi nop.m 999//// if (PR_1) sigma = -1.0// if (PR_2) sigma = 1.0//(p0) fadd.s1 A_lo = A_lo, poly nop.i 999 ;;}{ .mfi nop.m 999//// P_hi = s_Y * P_hi// A_lo = A_lo + poly//(p0) fadd.s1 A_lo = A_lo, w_hi nop.i 999 ;;}{ .mfi nop.m 999(p0) fma.s1 Res_lo = sigma, A_lo, P_lo nop.i 999 ;;}{ .mfb nop.m 999//// Res_hi = P_hi + sigma * A_hi// Res_lo = P_lo + sigma * A_lo//(p0) fma.s0 Result = Res_lo, s_Y, Res_hi//// Raise inexact.//br.ret.sptk b0 ;;}//// poly1 = P_5 + zsq * poly1// poly2 = zsq * poly2//L(ATANL_POLY): { .mmf(p0) xor swap = sign_X, swap nop.m 999(p0) fnma.s1 E_hold = E, U, f1 ;;}{ .mfi nop.m 999(p0) mov A_temp = Q//// poly1 = P_4 + zsq * poly1// swap = xor(swap,sign_X)//// sign_X gr_002// swap gr_004// poly1 = poly1 <== Done with poly1// poly1 = P_4 + zsq * poly1// swap = xor(swap,sign_X)//(p0) cmp.eq.unc p7, p6 = 0x00000, swap}{ .mfi nop.m 999(p0) fmpy.s1 P_hi = s_Y, P_hi nop.i 999 ;;}{ .mfi nop.m 999(p6) fsub.s1 sigma = f0, f1 nop.i 999}{ .mfi nop.m 999(p7) fadd.s1 sigma = f0, f1 nop.i 999 ;;}// ***********************************************// ******************** STEP4 ********************// ***********************************************{ .mmi nop.m 999(p0) addl table_ptr1 = @ltoff(Constants_atan#), gp nop.i 999};;{ .mmi ld8 table_ptr1 = [table_ptr1] nop.m 999 nop.i 999};;{ .mfi nop.m 999(p0) fma.s1 E = E, E_hold, E//// Following:// Iterate 3 times E = E + E*(1.0 - E*U)// Also load P_8, P_7, P_6, P_5, P_4// E_hold = 1.0 - E * U (1)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -