📄 e_powl.s
字号:
//
// exp(y*ln(x)) stage of powl (IA-64).
//
// Every path below ends at POWL_64_SHARED having prepared three factors so
// that the final result is formed by one fma:
//       FR_Result = FR_TMP1 * FR_TMP2 + FR_TMP3
// and having set p15 ("safe": certainly no overflow/underflow) or
// p14 ("unsafe").
//
//     Point to Table of W1s
//     Point to Table of W2s
//
{ .mmi
      add GR_W1_ptr = 0x2b0, GR_table_base          // Constants_exp_64_W1
      add GR_W2_ptr = 0x4b0, GR_table_base          // Constants_exp_64_W2
      cmp.le p6,p0= GR_Delta_Exp,GR_Special_Exp
}
;;

// Form two constants we need
//  1/ln2 * 2^63  to compute  w = x * 1/ln2 * 128
//  1.1000..000 * 2^(63+63-12) to right shift int(N) into the significand
{ .mfi
      setf.sig FR_INV_LN2_2TO63 = GR_sig_inv_ln2    // form 1/ln2 * 2^63
      nop.f 999
      and GR_Delta_Exp=GR_Delta_Exp,GR_exp_mask     // Get exponent of y-1
}
{ .mlx
      setf.d FR_RSHF_2TO51 = GR_rshf_2to51          // Form const 1.1000 * 2^(63+51)
      movl GR_rshf = 0x43e8000000000000             // 1.10000 2^63 for right shift
}
;;

{ .mfi
      nop.m 999
      fmpy.s1 FR_X_lo = FR_Input_Y, FR_logx_lo      // logx_lo is Y_lo
      cmp.eq p15, p0= r0, r0                        // Set p15, assume safe
}
;;

{ .mmi
      setf.exp FR_2TOM51 = GR_exp_2tom51            // Form 2^-51 for scaling float_N
      setf.d FR_RSHF = GR_rshf                      // Form right shift const 1.1000 * 2^63
      add GR_Table_Ptr1 = 0x50, GR_table_base       // Constants_exp_64_P for
                                                    // EXPL_SMALL path
}
;;

// The P_6 .. P_1 polynomial coefficients are loaded here, ahead of the
// branch decision, through a post-incremented pointer (16 bytes per entry).
{ .mmi
      ldfe FR_P_6 = [GR_Table_Ptr1],16              // Load P_6 for EXPL_SMALL path
      ;;
      ldfe FR_P_5 = [GR_Table_Ptr1],16              // Load P_5 for EXPL_SMALL path
      nop.i 999
}
;;

{ .mfi
      ldfe FR_P_4 = [GR_Table_Ptr1],16              // Load P_4 for EXPL_SMALL path
      fma.s1 FR_P_hi = FR_Input_Y, FR_logx_hi,FR_X_lo // logx_hi is Y_hi
      nop.i 999
}
;;

{ .mmi
      ldfe FR_P_3 = [GR_Table_Ptr1],16              // Load P_3 for EXPL_SMALL path
      ;;
      ldfe FR_P_2 = [GR_Table_Ptr1],16              // Load P_2 for EXPL_SMALL path
      nop.i 999
}
;;

// N = X * Inv_log2_by_2^12
// By adding 1.10...0*2^63 we shift and get round_int(N_signif) in significand.
// We actually add 1.10...0*2^51 to X * Inv_log2 to do the same thing.
{ .mfi
      ldfe FR_P_1 = [GR_Table_Ptr1]                 // Load P_1 for EXPL_SMALL path
      fma.s1 FR_N = FR_X, FR_INV_LN2_2TO63, FR_RSHF_2TO51
      nop.i 999
}
{ .mfb
      nop.m 999
      fms.s1 FR_P_lo= FR_Input_Y, FR_logx_hi, FR_P_hi // P_hi is X
(p6)  br.cond.spnt POWL_Y_ALMOST_1                  // Branch if |y-1| < 2^-50
}
;;

{ .mmi
      getf.exp GR_Expo_X = FR_X
      add GR_T1_ptr = 0x0b0, GR_table_base          // Constants_exp_64_T1
      add GR_T2_ptr = 0x1b0, GR_table_base          // Constants_exp_64_T2
}
;;

// float_N = round_int(N)
// The significand of N contains the rounded integer part of X * 2^12/ln2,
// as a twos complement number in the lower bits (that is, it may be negative).
// That twos complement number (called N) is put into GR_N_fix.

// Since N is scaled by 2^51, it must be multiplied by 2^-51
// before the shift constant 1.10000 * 2^63 is subtracted to yield float_N.
// Thus, float_N contains the floating point version of N
{ .mfi
      add GR_Table_Ptr = 0x20, GR_table_base        // Constants_exp_64_A
      fms.s1 FR_float_N = FR_N, FR_2TOM51, FR_RSHF  // Form float_N
      nop.i 999
}
// Create low part of Y(ln(x)_hi + ln(x)_lo) as P_lo
{ .mfi
      mov GR_Big_Pos_Exp = 0x3ffe                   // 16382, largest safe exponent
      fadd.s1 FR_P_lo = FR_P_lo, FR_X_lo
      mov GR_Big_Neg_Exp = -0x3ffd                  // -16381 smallest safe exponent
}
;;

{ .mfi
      nop.m 999
      fmpy.s1 FR_rsq = FR_X, FR_X                   // rsq = X*X for EXPL_SMALL path
      mov GR_vsm_expo = -70                         // Exponent for very small path
}
{ .mfi
      nop.m 999
      fma.s1 FR_poly_lo = FR_P_6, FR_X, FR_P_5      // poly_lo for EXPL_SMALL path
      add GR_temp = 0x1,r0                          // For tiny signif if small path
}
;;

//
//      If expo_X < -6 goto exp_small
//
{ .mmi
      getf.sig GR_N_fix = FR_N
      ldfe FR_A_3 = [GR_Table_Ptr],16               // Load A_3
      and GR_Expo_X = GR_Expo_X, GR_exp_mask        // Get exponent of X
}
;;

{ .mfi
      ldfe FR_A_2 = [GR_Table_Ptr],16               // Load A_2
      nop.f 999
      sub GR_Expo_X = GR_Expo_X, GR_exp_bias        // Get true exponent of X
}
;;

//
//     If -6 > Expo_X, set P9 and branch
//
{ .mfb
      cmp.gt p9, p0 = -6, GR_Expo_X
      fnma.s1 FR_r = FR_L_hi, FR_float_N, FR_X      // r = X - L_hi * float_N
(p9)  br.cond.spnt EXPL_SMALL                       // Branch if |X| < 2^-6
}
;;

//
//     If 14 <= Expo_X, set P10
//
{ .mib
      cmp.le p10, p0 = 14, GR_Expo_X
      nop.i 999
(p10) br.cond.spnt EXPL_HUGE                        // Branch if |X| >= 2^14
}
;;

//
//      Load single T1
//      Load single T2
//      W_1_p1 = W_1 + 1
//
{ .mmi
      nop.m 999
      nop.m 999
      extr.u GR_M1 = GR_N_fix, 6, 6                 // Extract index M_1 (bits 6..11)
}
;;

//
//      M_2 = extr.u(N_fix,0,6)
//      (k itself is taken from bit 12 upward in the following bundle.)
//      M_1 scales the W1 pointer by 8 and the T1 pointer by 4, matching the
//      ldfd (double) and ldfs (single) loads that use them.
//
{ .mmi
      shladd GR_W1_ptr = GR_M1,3,GR_W1_ptr          // Point to W1
      shladd GR_T1_ptr = GR_M1,2,GR_T1_ptr          // Point to T1
      extr.u GR_M2 = GR_N_fix, 0, 6                 // Extract index M_2 (bits 0..5)
}
;;

// N_fix is only correct up to 50 bits because of our right shift technique.
// Actually in the normal path we will have restricted K to about 14 bits.
// Somewhat arbitrarily we extract 32 bits.
{ .mmi
      ldfd FR_W1 = [GR_W1_ptr]
      shladd GR_W2_ptr = GR_M2,3,GR_W2_ptr          // Point to W2
      extr GR_k = GR_N_fix, 12, 32                  // Extract k (signed extract)
}
;;

{ .mfi
      ldfs FR_T1 = [GR_T1_ptr]
      fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r      // r = r - L_lo * float_N
      shladd GR_T2_ptr = GR_M2,2,GR_T2_ptr          // Point to T2
}
{ .mfi
      add GR_exp_bias_p_k = GR_exp_bias, GR_k       // Biased exponent for 2^k
      nop.f 999
      cmp.gt p14,p15 = GR_k,GR_Big_Pos_Exp          // k too large: p14 (unsafe), else p15
}
;;

//
//      if k < big_neg_exp, set p14 and Safe=False
//
{ .mmi
      ldfs FR_T2 = [GR_T2_ptr]
(p15) cmp.lt p14,p15 = GR_k,GR_Big_Neg_Exp
      nop.i 999
}
;;

{ .mmi
      setf.exp FR_Scale = GR_exp_bias_p_k           // Scale = 2^k
      ldfd FR_W2 = [GR_W2_ptr]
      nop.i 999
}
;;

{ .mfi
      ldfe FR_A_1 = [GR_Table_Ptr],16               // Load A_1
      fadd.s1 FR_r = FR_r, FR_X_cor                 // Fold correction term into r
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fadd.s1 FR_W_1_p1 = FR_W1, f1                 // W_1_p1 = W1 + 1
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fma.s1 FR_poly = FR_r, FR_A_3, FR_A_2         // poly = r*A_3 + A_2
      nop.i 999
}
{ .mfi
      nop.m 999
      fmpy.s1 FR_rsq = FR_r, FR_r                   // rsq = r*r
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fmpy.s1 FR_T = FR_T1, FR_T2                   // T = T1*T2
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fma.s1 FR_W = FR_W2, FR_W_1_p1, FR_W1         // W = W2*(W1+1) + W1
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fma.s1 FR_TMP1 = FR_Scale, FR_Sgn, f0         // TMP1 = Scale * Sgn
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fma.s1 FR_poly = FR_r, FR_poly, FR_A_1        // poly = r*poly + A_1
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fma.s1 FR_TMP2 = FR_T, f1, f0                 // TMP2 = Y_hi = T
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fadd.s1 FR_Wp1 = FR_W, f1                     // Wp1 = W + 1
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fma.s1 FR_poly = FR_rsq, FR_poly,FR_r         // poly = rsq*poly + r
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fma.s1 FR_Tscale = FR_T, FR_TMP1, f0          // Scale * Sgn * T
      nop.i 999
}
{ .mfi
      nop.m 999
      fma.s1 FR_Y_lo = FR_Wp1, FR_poly, FR_W        // Y_lo = (W+1)*poly + W
      nop.i 999
}
;;

{ .mfb
      nop.m 999
      fmpy.s1 FR_TMP3 = FR_Y_lo, FR_Tscale          // TMP3 = Y_lo * Scale * Sgn * T
      br.cond.sptk POWL_64_SHARED
}
;;

EXPL_SMALL:
// Here if |ylogx| < 2^-6
//
// The shared fma receives TMP1 = Sgn, TMP2 = 1.0 and TMP3 = the signed
// polynomial value (or Sgn*X on the very small sub-path).
//
// Begin creating lsb to perturb final result
//
{ .mfi
      setf.sig FR_temp = GR_temp                    // significand = 1 (GR_temp set earlier)
      fma.s1 FR_poly_lo = FR_poly_lo, FR_X, FR_P_4
      cmp.lt p12, p0 = GR_Expo_X, GR_vsm_expo       // Test |ylogx| < 2^-70
}
{ .mfi
      nop.m 999
      fma.s1 FR_poly_hi = FR_P_2, FR_X, FR_P_1
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fmpy.s1 FR_TMP2 = f1, f1                      // TMP2 = 1.0
      nop.i 999
}
{ .mfi
      nop.m 999
      fmpy.s1 FR_TMP1 = FR_Sgn, f1                  // TMP1 = Sgn
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fmpy.s1 FR_r4 = FR_rsq, FR_rsq                // r4 = rsq*rsq
(p12) cmp.eq p15, p0 = r0, r0                       // Set safe if |ylogx| < 2^-70
}
{ .mfb
      nop.m 999
(p12) fmpy.s1 FR_TMP3 = FR_Sgn, FR_X
(p12) br.cond.spnt POWL_64_SHARED                   // Branch if |ylogx| < 2^-70
}
;;

{ .mfi
      nop.m 999
      fma.s1 FR_poly_lo = FR_poly_lo, FR_X, FR_P_3
      nop.i 999
}
{ .mfi
      nop.m 999
      fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, FR_X
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fma.s1 FR_Y_lo = FR_poly_lo, FR_r4, FR_poly_hi
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fmpy.s1 FR_TMP3 = FR_Y_lo, FR_TMP1            // Add sign info
      nop.i 999
}
;;

//
//     Toggle on last bit of Y_lo
//     Set lsb of Y_lo to 1
//
// Note: "for" below is the IA-64 floating-point logical OR instruction,
// not a loop keyword.
//
{ .mfi
      nop.m 999
      for FR_temp = FR_Y_lo,FR_temp
      nop.i 999
}
;;

{ .mfb
      nop.m 999
      fmerge.se FR_TMP3 = FR_TMP3,FR_temp           // sign/exp of TMP3, significand of temp
      br.cond.sptk POWL_64_SHARED
}
;;

EXPL_HUGE:
// Here if |ylogx| >= 2^14
//
// The result is forced to overflow or underflow by multiplying constants of
// extreme magnitude; p14 is set so the caller-visible path is "unsafe".
//
// NOTE(review): against the 0xffff exponent bias used elsewhere in this code
// (e.g. "0xffff + 16383"), the encodings below work out to +24001, -23999
// and -24099, one off from the round figures in the original comments.
// Presumably harmless because only the magnitude matters here - confirm.
{ .mfi
      mov GR_temp = 0x0A1DC                         // If X < 0, exponent -24100
      fcmp.gt.s1 p12, p13 = FR_X, f0                // Test X > 0
      cmp.eq p14, p15 = r0, r0                      // Set Safe to false
}
;;

{ .mmi
(p12) mov GR_Mask = 0x15DC0                         // If X > 0, exponent +24000
(p13) mov GR_Mask = 0x0A240                         // If X < 0, exponent -24000
      nop.i 999
}
;;

{ .mmf
      setf.exp FR_TMP2 = GR_Mask                    // Form Y_hi = TMP2
(p13) setf.exp FR_Y_lo = GR_temp                    // If X < 0, Y_lo = 2^-24100
(p12) mov FR_Y_lo = f1                              // IF X > 0, Y_lo = 1.0
}
;;

{ .mfi
      nop.m 999
      fmpy.s1 FR_TMP1 = FR_TMP2, FR_Sgn             // TMP1 = Y_hi * Sgn
      nop.i 999
}
;;

{ .mfb
      nop.m 999
      fmpy.s1 FR_TMP3 = FR_Y_lo,FR_TMP1             // TMP3 = Y_lo * (Y_hi * Sgn)
      br.cond.sptk POWL_64_SHARED
}
;;

POWL_Y_ALMOST_1:
// Here if delta = |y-1| < 2^-50
//
//  x**(1 + delta) = x * e (ln(x)*delta) = x ( 1 + ln(x) * delta)
//
// Computation will be safe for 2^-16381 <= x < 2^16383
//
// The shared fma receives TMP1 = x*Delta, TMP2 = logx_hi + X_lo, TMP3 = x.
// p15/p14 are decided from the exponent field of x alone.
{ .mfi
      mov GR_exp_ynear1_oflow = 0xffff + 16383      // Biased exponent upper bound
      fma.s1 FR_TMP1 = FR_Input_X,FR_Delta,f0       // TMP1 = x * Delta
      and GR_exp_x = GR_exp_mask, GR_signexp_x      // Get exponent of x
}
;;

{ .mfi
      cmp.lt p15, p14 = GR_exp_x, GR_exp_ynear1_oflow
      fma.s1 FR_TMP2 = FR_logx_hi,f1,FR_X_lo        // TMP2 = logx_hi + X_lo
      mov GR_exp_ynear1_uflow = 0xffff - 16381      // Biased exponent lower bound
}
;;

{ .mfb
(p15) cmp.ge p15, p14 = GR_exp_x, GR_exp_ynear1_uflow
      fma.s1 FR_TMP3 = FR_Input_X,f1,f0             // TMP3 = x
      br.cond.sptk POWL_64_SHARED
}
;;

POWL_64_SQUARE:
//
//     Here if x not zero and y=2.
//
//     Setup for multipath code
//
// TMP1 and TMP2 are both copies of Input_X and TMP3 is zero, so the shared
// fma computes x*x. This path falls through into POWL_64_SHARED.
{ .mfi
      mov GR_exp_square_oflow = 0xffff + 8192       // Exponent where x*x overflows
      fmerge.se FR_TMP1 = FR_Input_X, FR_Input_X
      and GR_exp_x = GR_exp_mask, GR_signexp_x      // Get exponent of x
}
;;

{ .mfi
      cmp.lt p15, p14 = GR_exp_x, GR_exp_square_oflow // Decide safe/unsafe
      fmerge.se FR_TMP2 = FR_Input_X, FR_Input_X
      mov GR_exp_square_uflow = 0xffff - 8191       // Exponent where x*x underflows
}
;;

{ .mfi
(p15) cmp.ge p15, p14 = GR_exp_x, GR_exp_square_uflow // Decide safe/unsafe
      fma.s1 FR_TMP3 = f0,f0,f0                     // TMP3 = 0
      nop.i 999
}
;;

//
//      This is the shared path that will set overflow and underflow.
//
POWL_64_SHARED:

//
//      Return if no danger of over or underflow.
//
{ .mfb
      nop.m 999
      fma.s0 FR_Result = FR_TMP1, FR_TMP2, FR_TMP3  // Final result, user status field s0
(p15) br.ret.sptk b0                                // Main path return if certain no over/underflow
}
;;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -