📄 e_acoshl.s
字号:
FR_R1 = f88FR_X_Hi = f89FR_X_lo = f90FR_HH = f91FR_LL = f92FR_HL = f93FR_LH = f94 // Error handler registersFR_Arg_X = f95FR_Arg_Y = f0// General Purpose Registers // General prolog registersGR_PFS = r32GR_OneP125 = r33GR_TwoP63 = r34GR_Arg = r35GR_Half = r36 // Near 1 path registersGR_Poly_P = r37GR_Poly_Q = r38 // Special logl registersGR_Index1 = r39 GR_Index2 = r40 GR_signif = r41 GR_X_0 = r42 GR_X_1 = r43 GR_X_2 = r44 GR_minus_N = r45GR_Z_1 = r46 GR_Z_2 = r47 GR_N = r48 GR_Bias = r49 GR_M = r50 GR_Index3 = r51 GR_exp_2tom80 = r52 GR_exp_mask = r53 GR_exp_2tom7 = r54 GR_ad_ln10 = r55 GR_ad_tbl_1 = r56GR_ad_tbl_2 = r57GR_ad_tbl_3 = r58GR_ad_q = r59GR_ad_z_1 = r60GR_ad_z_2 = r61GR_ad_z_3 = r62//// Added for unwind support//GR_SAVE_PFS = r32GR_SAVE_B0 = r33GR_SAVE_GP = r34GR_Parameter_X = r64GR_Parameter_Y = r65GR_Parameter_RESULT = r66GR_Parameter_TAG = r67.section .textGLOBAL_LIBM_ENTRY(acoshl){ .mfi alloc GR_PFS = ar.pfs,0,32,4,0 // Local frame allocation fcmp.lt.s1 p11, p0 = FR_Arg, f1 // if arg is less than 1 mov GR_Half = 0xfffe // 0.5's exp}{ .mfi addl GR_Poly_Q = @ltoff(Poly_Q), gp // Address of Q-coeff table fma.s1 FR_X2 = FR_Arg, FR_Arg, f0 // Obtain x^2 addl GR_Poly_P = @ltoff(Poly_P), gp // Address of P-coeff table};; { .mfi getf.d GR_Arg = FR_Arg // get arument as double (int64) fma.s0 FR_Two = f1, f1, f1 // construct 2.0 addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp // logl tables}{ .mlx nop.m 0 movl GR_TwoP63 = 0x43E8000000000000 // 0.5*2^63 (huge arguments)};; { .mfi ld8 GR_Poly_P = [GR_Poly_P] // get actual P-coeff table address fcmp.eq.s1 p10, p0 = FR_Arg, f1 // if arg == 1 (return 0) nop.i 0}{ .mlx ld8 GR_Poly_Q = [GR_Poly_Q] // get actual Q-coeff table address movl GR_OneP125 = 0x3FF2000000000000 // 1.125 (near 1 path bound)};;{ .mfi ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1 fclass.m p7,p0 = FR_Arg, 0xe3 // if arg NaN inf cmp.le p9, p0 = GR_TwoP63, GR_Arg // if arg > 0.5*2^63 ('huges')}{ .mfb cmp.ge p8, p0 = GR_OneP125, GR_Arg // if arg<1.125 -near 1 path fms.s1 FR_XM1 = FR_Arg, f1, f1 // X0 = X-1 (for near 1 path)(p11) br.cond.spnt acoshl_lt_pone // error branch (less than 1)};; { .mmi setf.exp FR_Half = GR_Half // construct 0.5(p9) setf.s FR_XLog_Lo = r0 // Low of logl arg=0 (Huges path) mov GR_exp_mask = 0x1FFFF // Create exponent mask};; { .mmf (p8) ldfe FR_PP5 = [GR_Poly_P],16 // Load P5(p8) ldfe FR_QQ5 = [GR_Poly_Q],16 // Load Q5 fms.s1 FR_M2 = FR_X2, f1, f1 // m2 = x^2 - 1};;{ .mfi (p8) ldfe FR_QQ4 = [GR_Poly_Q],16 // Load Q4 fms.s1 FR_M2L = FR_Arg, FR_Arg, FR_X2 // low part of // m2 = fma(X*X - m2) add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1}{ .mfb(p8) ldfe FR_PP4 = [GR_Poly_P],16 // Load P4 (p7) fma.s0 FR_Res = FR_Arg,f1,FR_Arg // r = a + a (Nan, Inf)(p7) br.ret.spnt b0 // return (Nan, Inf)};; { .mfi(p8) ldfe FR_PP3 = [GR_Poly_P],16 // Load P3 nop.f 0 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P}{ .mfb(p8) ldfe FR_QQ3 = [GR_Poly_Q],16 // Load Q3(p9) fms.s1 FR_XLog_Hi = FR_Two, FR_Arg, f1 // Hi of log arg = 2*X-1(p9) br.cond.spnt huges_logl // special version of log};; { .mfi (p8) ldfe FR_PP2 = [GR_Poly_P],16 // Load P2(p8) fma.s1 FR_2XM1 = FR_Two, FR_XM1, f0 // 2X0 = 2 * X0 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2}{ .mfb(p8) ldfe FR_QQ2 = [GR_Poly_Q],16 // Load Q2(p10) fma.s0 FR_Res = f0,f1,f0 // r = 0 (arg = 1)(p10) br.ret.spnt b0 // return (arg = 1) };; { .mmi (p8) ldfe FR_PP1 = [GR_Poly_P],16 // Load P1(p8) ldfe FR_QQ1 = [GR_Poly_Q],16 // Load Q1 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2};;{ .mfi (p8) ldfe FR_PP0 = [GR_Poly_P] // Load P0 fma.s1 FR_Tmp = f1, f1, FR_M2 // Tmp = 1 + m2 add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3}{ .mfb(p8) ldfe FR_QQ0 = [GR_Poly_Q] nop.f 0(p8) br.cond.spnt near_1 // near 1 path};; { .mfi ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi nop.f 0 mov GR_Bias = 0x0FFFF // Create exponent bias};;{ .mfi nop.m 0 frsqrta.s1 FR_Rcp, p0 = FR_M2 // Rcp = 1/m2 reciprocal appr. nop.i 0};; { .mfi ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo fms.s1 FR_Tmp = FR_X2, f1, FR_Tmp // Tmp = x^2 - Tmp nop.i 0};;{ .mfi ldfe FR_Q4 = [GR_ad_q],16 // Load Q4 fma.s1 FR_GG = FR_Rcp, FR_M2, f0 // g = Rcp * m2 // 8 bit Newton Raphson iteration nop.i 0}{ .mfi nop.m 0 fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp nop.i 0};;{ .mfi ldfe FR_Q3 = [GR_ad_q],16 // Load Q3 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h nop.i 0}{ .mfi nop.m 0 fma.s1 FR_M2L = FR_Tmp, f1, FR_M2L // low part of m2 = Tmp+m2l nop.i 0};;{ .mfi ldfe FR_Q2 = [GR_ad_q],16 // Load Q2 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g // 16 bit Newton Raphson iteration nop.i 0}{ .mfi nop.m 0 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h nop.i 0};;{ .mfi ldfe FR_Q1 = [GR_ad_q] // Load Q1 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g // 32 bit Newton Raphson iteration nop.i 0}{ .mfi nop.m 0 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h nop.i 0};;{ .mfi nop.m 0 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g // 64 bit Newton Raphson iteration nop.i 0}{ .mfi nop.m 0 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h nop.i 0};;{ .mfi nop.m 0 fnma.s1 FR_DD = FR_GG, FR_GG, FR_M2 // Remainder d = g * g - p2 nop.i 0}{ .mfi nop.m 0 fma.s1 FR_XLog_Hi = FR_Arg, f1, FR_GG // bh = z + gh nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_DD = FR_DD, f1, FR_M2L // add p2l: d = d + p2l nop.i 0};;{ .mfi getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1 nop.f 0 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7};;{ .mfi nop.m 0 fma.s1 FR_GL = FR_DD, FR_HH, f0 // gl = d * h extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif}{ .mfi nop.m 0 fma.s1 FR_XLog_Hi = FR_DD, FR_HH, FR_XLog_Hi // bh = bh + gl nop.i 0};;{ .mmi shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1 shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif.};;{ .mmi ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1 nop.m 0 nop.i 0};;{ .mmi ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1 nop.m 0 nop.i 0};;{ .mfi nop.m 0 fms.s1 FR_XLog_Lo = FR_Arg, f1, FR_XLog_Hi // bl = x - bh pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1};;// WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!// "DEAD" ZONE!{ .mfi nop.m 0 nop.f 0 nop.i 0};;{ .mfi nop.m 0 fmerge.se FR_S_hi = f1,FR_XLog_Hi // Form |x+1| nop.i 0};;{ .mmi getf.exp GR_N = FR_XLog_Hi // Get N = exponent of x+1 ldfd FR_h = [GR_ad_tbl_1] // Load h_1 nop.i 0};;{ .mfi nop.m 0 nop.f 0 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1 };;{ .mfi shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2 fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_GG // bl = bl + gg mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80}{ .mfi shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2 nop.f 0 sub GR_N = GR_N, GR_Bias // sub bias from exp};;{ .mmi ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2 sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)};;{ .mmi ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2 nop.m 0 nop.i 0};;{ .mmi setf.sig FR_float_N = GR_N // Put integer N into rightmost sign setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2};;// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!) // BECAUSE OF POSSIBLE 10 CLOCKS STALL!// (Just nops added - nothing to do here){ .mfi nop.m 0 fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_GL // bl = bl + gl nop.i 0};;{ .mfi nop.m 0 nop.f 0 nop.i 0};;{ .mfi nop.m 0 nop.f 0 nop.i 0};;{ .mfi nop.m 0 nop.f 0 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2};;{ .mfi shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3 nop.f 0 nop.i 0};;{ .mfi ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3 nop.f 0 nop.i 0};;{ .mfi ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3 fcvt.xf FR_float_N = FR_float_N nop.i 0};;{ .mfi nop.m 0 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2 nop.i 0}{ .mfi nop.m 0 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2 nop.i 0};;{ .mfi nop.m 0 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2 nop.i 0}{ .mfi nop.m 0 fma.s1 FR_S_lo = FR_XLog_Lo, FR_2_to_minus_N, f0 //S_lo=S_lo*2^(-N) nop.i 0};;{ .mfi nop.m 0 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3 nop.i 0}{ .mfi nop.m 0 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3 nop.i 0};;{ .mfi nop.m 0 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3 nop.i 0};;{ .mfi nop.m 0 fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1 nop.i 0}{ .mfi nop.m 0 fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h=N*log2_lo+h nop.i 0}{ .mfi nop.m 0 fma.s1 FR_r = FR_G, FR_S_lo, FR_r // r=G*S_lo+(G*S_hi-1) nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3 nop.i 0}{ .mfi nop.m 0 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2 nop.i 0}{ .mfi nop.m 0 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3 nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h nop.i 0};;{ .mfi nop.m 0 fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo=poly_hi+poly_lo nop.i 0};;{ .mfb nop.m 0 fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi br.ret.sptk b0 // Common exit for 2^-7 < x < inf};;huges_logl:{ .mmi getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7 nop.i 0};;{ .mfi add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1 nop.f 0 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P}{ .mfi add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2 nop.f 0 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2};;{ .mfi add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3 nop.f 0 extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif};;{ .mfi shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1 nop.f 0 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif.};;{ .mfi ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1 nop.f 0 mov GR_exp_mask = 0x1FFFF // Create exponent mask}{ .mfi shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1 nop.f 0 mov GR_Bias = 0x0FFFF // Create exponent bias
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -