📄 s_tanl.s
字号:
// (tail of an instruction bundle that opens before this excerpt)
      nop.i 999
}
;;

// NOTE(review): line structure restored from a listing whose newlines were
// lost.  Several inherited block comments appear to describe the instruction
// that precedes them rather than the one that follows; where a trailing
// comment is present it states what that instruction literally computes
// (fma: a*b+c, fms: a*b-c, fnma: -(a*b)+c).

// Case 2_reduce: U_1 = N * P_2 + w
{ .mfi
      nop.m 999
      fma.s1  U_1 = N, P_2, w2  // U_1 = N * P_2 + w for |s| < 2^-33
      nop.i 999
}
;;

//
//    Decide between case_1 and case_2 reduce:
//    Case 1_reduce:  |s| >= 2**(-33)
//    Case 2_reduce:  |s| < 2**(-33)
//
{ .mfi
      nop.m 999
      fcmp.lt.s1 p9, p8 = s_val, TWO_TO_NEG33
      nop.i 999
}
;;

{ .mfi
      nop.m 999
(p9)  fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33
      nop.i 999
}
;;

// Case 1_reduce:  c = s - r
{ .mfi
      nop.m 999
      fsub.s1 c = s_val, r     // c = s_val - r for |s| >= 2^-33
      nop.i 999
}
;;

// Case 2_reduce:  r is complete here - continue to calculate c .
// r = s - U_1
{ .mfi
      nop.m 999
(p9)  fsub.s1 r = s_val, U_1
      nop.i 999
}
{ .mfi
      nop.m 999
(p9)  fms.s1 U_2 = N, P_2, U_1 // U_2 = N * P_2 - U_1
      nop.i 999
}
;;

//
//    Case 1_reduce: Is |r| < 2**(-2), if so set PR_10
//    else set PR_13.
//
{ .mfi
      nop.m 999
      fand B = B_mask1, r
      nop.i 999
}
{ .mfi
      nop.m 999
(p8)  fcmp.lt.unc.s1 p10, p13 = r, TWO_TO_NEG2
      nop.i 999
}
;;

{ .mfi
(p8)  getf.sig sig_r = r               // Get signif of r if |s| >= 2^-33
      nop.f 999
      nop.i 999
}
;;

{ .mfi
(p8)  getf.exp exp_r = r               // Extract signexp of r if |s| >= 2^-33
(p10) fcmp.gt.s1 p10, p13 = r, NEGTWO_TO_NEG2
      nop.i 999
}
;;

// Case 1_reduce:  c is complete here.
// Case 1: Branch to SMALL_R or NORMAL_R.
// c = c + w (w has not been negated.)
{ .mfi
      nop.m 999
(p8)  fsub.s1 c = c, w         // c = c - w for |s| >= 2^-33
      nop.i 999
}
{ .mbb
      nop.m 999
(p10) br.cond.spnt TANL_SMALL_R     // Branch if pi/4 < |x| < 2^24 and |r|<1/4
(p13) br.cond.sptk TANL_NORMAL_R_A  // Branch if pi/4 < |x| < 2^24 and |r|>=1/4
}
;;

// Here if pi/4 < |x| < 2^24 and |s| < 2^-33
//
//    Is i_1 = lsb of N_fix_gr even or odd?
//    if i_1 == 0, set p11, else set p12.
//
{ .mfi
      nop.m 999
      fsub.s1 s_val = s_val, r
      add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl)
}
{ .mfi
      nop.m 999
//
//    Case 2_reduce:
//    U_2 = N * P_2 - U_1
//    Not needed until later.
//
      fadd.s1 U_2 = U_2, w2
//
//    Case 2_reduce:
//    s = s - r
//    U_2 = U_2 + w
//
      nop.i 999
}
;;

//
//    Case 2_reduce:
//    c = c - U_2
//    c is complete here
//    Argument reduction ends here.
//
{ .mfi
      nop.m 999
      fmpy.s1 rsq = r, r
      tbit.z p11, p12 = N_fix_gr, 0 ;; // Set p11 if N even, p12 if odd
}
{ .mfi
      nop.m 999
(p12) frcpa.s1 S_hi,p0 = f1, r
      nop.i 999
}
{ .mfi
      nop.m 999
      fsub.s1 c = s_val, U_1
      nop.i 999
}
;;

{ .mmi
      add table_ptr1 = 160, table_base ;;  // Point to tanl_table_p1
      ldfe P1_1 = [table_ptr1],144
      nop.i 999 ;;
}
//
//    Load P1_1 and point to Q1_1 .
//
{ .mfi
      ldfe Q1_1 = [table_ptr1]
//
//    N even: rsq = r * Z
//    N odd:  S_hi = frcpa(r)
//
(p12) fmerge.ns S_hi = S_hi, S_hi
      nop.i 999
}
{ .mfi
      nop.m 999
//
//    Case 2_reduce:
//    c = s - U_1
//
(p9)  fsub.s1 c = c, U_2
      nop.i 999 ;;
}
{ .mfi
      nop.m 999
(p12) fma.s1  poly1 = S_hi, r, f1      // N odd: poly1 = S_hi * r + f1
      nop.i 999 ;;
}
{ .mfi
      nop.m 999
//
//    N odd:  Change sign of S_hi
//
(p11) fmpy.s1 rsq = rsq, P1_1          // N even: rsq = rsq * P1_1
      nop.i 999 ;;
}
{ .mfi
      nop.m 999
(p12) fma.s1 S_hi = S_hi, poly1, S_hi  // N odd: S_hi = S_hi * poly1 + S_hi
      nop.i 999 ;;
}
{ .mfi
      nop.m 999
//
//    N even: rsq = rsq * P1_1
//    N odd:  poly1 = 1.0 + S_hi * r    16 bits partial account for necessary
//
(p11) fma.s1 Poly = r, rsq, c          // N even: Poly = r * rsq + c
      nop.i 999 ;;
}
{ .mfi
      nop.m 999
//
//    N even: Poly = c + r * rsq
//    N odd:  S_hi = S_hi + S_hi*poly1  16 bits account for necessary
//
(p12) fma.s1 poly1 = S_hi, r, f1       // N odd: poly1 = S_hi * r + f1
(p11) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl
}
{ .mfi
      nop.m 999
//
//    N even: Result = Poly + r
//    N odd:  poly1 = 1.0 + S_hi * r    32 bits partial
//
(p14) fadd.s0 Result = r, Poly         // for tanl
      nop.i 999
}
{ .mfi
      nop.m 999
(p15) fms.s0 Result = r, mOne, Poly    // for cotl
      nop.i 999
}
;;

{ .mfi
      nop.m 999
(p12) fma.s1  S_hi = S_hi, poly1, S_hi // N odd: S_hi = S_hi * poly1 + S_hi
      nop.i 999 ;;
}
{ .mfi
      nop.m 999
//
//    N even: Result1 = Result + r
//    N odd:  S_hi = S_hi * poly1 + S_hi    32 bits
//
(p12) fma.s1 poly1 = S_hi, r, f1       // N odd: poly1 = S_hi * r + f1
      nop.i 999 ;;
}
{ .mfi
      nop.m 999
//
//    N odd:  poly1 = S_hi * r + 1.0    64 bits partial
//
(p12) fma.s1 S_hi = S_hi, poly1, S_hi  // N odd: S_hi = S_hi * poly1 + S_hi
      nop.i 999 ;;
}
{ .mfi
      nop.m 999
//
//    N odd:  poly1 = S_hi * poly + 1.0    64 bits
//
(p12) fma.s1 poly1 = S_hi, r, f1       // N odd: poly1 = S_hi * r + f1
      nop.i 999 ;;
}
{ .mfi
      nop.m 999
//
//    N odd:  poly1 = S_hi * r + 1.0
//
(p12) fma.s1 poly1 = S_hi, c, poly1    // N odd: poly1 = S_hi * c + poly1
      nop.i 999 ;;
}
{ .mfi
      nop.m 999
//
//    N odd:  poly1 = S_hi * c + poly1
//
(p12) fmpy.s1 S_lo = S_hi, poly1       // N odd: S_lo = S_hi * poly1
      nop.i 999 ;;
}
{ .mfi
      nop.m 999
//
//    N odd:  S_lo = S_hi * poly1
//
(p12) fma.s1 S_lo = Q1_1, r, S_lo      // N odd: S_lo = Q1_1 * r + S_lo
(p12) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl
}
{ .mfi
      nop.m 999
//
//    N odd:  Result = S_hi + S_lo
//
      fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
      nop.i 999 ;;
}
{ .mfi
      nop.m 999
//
//    N odd:  S_lo = S_lo + Q1_1 * r
//
(p14) fadd.s0 Result = S_hi, S_lo      // for tanl
      nop.i 999
}
{ .mfb
      nop.m 999
(p15) fms.s0 Result = S_hi, mOne, S_lo // for cotl
      br.ret.sptk b0 ;;    // Exit for pi/4 <= |x| < 2^24 and |s| < 2^-33
}


TANL_LARGER_ARG:
// Here if 2^24 <= |x| < 2^63
//
// ARGUMENT REDUCTION CODE - CASE 3 and 4
//

{ .mmf
      mov GR_exp_2tom14 = 0xffff - 14      // Form signexp of 2^-14
      mov GR_exp_m2tom14 = 0x2ffff - 14    // Form signexp of -2^-14
      fmpy.s1 N_0 = Norm_Arg, Inv_P_0
}
;;

{ .mmi
      setf.exp TWO_TO_NEG14 = GR_exp_2tom14      // Form 2^-14
      setf.exp NEGTWO_TO_NEG14 = GR_exp_m2tom14  // Form -2^-14
      nop.i 999
}
;;

//
//    Adjust table_ptr1 to beginning of table.
//    N_0 = Arg * Inv_P_0
//
{ .mmi
      add table_ptr2 = 144, table_base ;;  // Point to 2^-2
      ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2]
      nop.i 999
}
;;

//
//    N_0_fix  = integer part of N_0 .
//
//
//    Make N_0 the integer part.
//
{ .mfi
      nop.m 999
      fcvt.fx.s1 N_0_fix = N_0
      nop.i 999 ;;
}
{ .mfi
      setf.sig B_mask1 = bmask1     // Form mask to get 5 msb of r
      fcvt.xf N_0 = N_0_fix
      nop.i 999 ;;
}
{ .mfi
      setf.sig B_mask2 = bmask2     // Form mask to form B from r
      fnma.s1 ArgPrime = N_0, P_0, Norm_Arg
      nop.i 999
}
{ .mfi
      nop.m 999
      fmpy.s1 w = N_0, d_1
      nop.i 999 ;;
}
//
//    ArgPrime = -N_0 * P_0 + Arg
//    w  = N_0 * d_1
//
//
//    N = ArgPrime * 2/pi
//
//      fcvt.fx.s1 N_fix = N
// NOTE(review): the fcvt.fx line above is read as commented-out code because
// it sits outside any bundle in the listing - confirm against upstream.
// Use special scaling to right shift so N=Arg * 2/pi is in rightmost bits
// Branch to Cases 3 or 4  if Arg <= -2**24 or Arg >= 2**24
{ .mfi
      nop.m 999
      fma.s1 N_fix = ArgPrime, FR_inv_pi_2to63, FR_rshf_2to64
      nop.i 999 ;;
}
// Convert integer N_fix back to normalized floating-point value.
{ .mfi
      nop.m 999
      fms.s1 N = N_fix, FR_2tom64, FR_rshf  // Use scaling to get N floated
      nop.i 999
}
;;

//
//    N is the integer part of the reduced-reduced argument.
//    Put the integer in a GP register.
//
{ .mfi
      getf.sig N_fix_gr = N_fix
      nop.f 999
      nop.i 999
}
;;

//
//    s_val = -N*P_1 + ArgPrime
//    w = -N*P_2 + w
//
{ .mfi
      nop.m 999
      fnma.s1 s_val = N, P_1, ArgPrime
      nop.i 999
}
{ .mfi
      nop.m 999
      fnma.s1 w = N, P_2, w
      nop.i 999
}
;;

// Case 4: V_hi = N * P_2
// Case 4: U_hi = N_0 * d_1
{ .mfi
      nop.m 999
      fmpy.s1 V_hi = N, P_2       // V_hi = N * P_2 for |s| < 2^-14
      nop.i 999
}
{ .mfi
      nop.m 999
      fmpy.s1 U_hi = N_0, d_1     // U_hi = N_0 * d_1 for |s| < 2^-14
      nop.i 999
}
;;

// Case 3: r = s_val + w (Z complete)
// Case 4: w = N * P_3
{ .mfi
      nop.m 999
      fadd.s1 r = s_val, w        // r = s_val + w for |s| >= 2^-14
      nop.i 999
}
{ .mfi
      nop.m 999
      fmpy.s1 w2 = N, P_3         // w = N * P_3 for |s| < 2^-14
      nop.i 999
}
;;

// Case 4: A =  U_hi + V_hi
// Note: Worry about switched sign of V_hi, so subtract instead of add.
// Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup)
// Note: the (-) is still missing for V_hi.
{ .mfi
      nop.m 999
      fsub.s1 A = U_hi, V_hi      // A = U_hi - V_hi for |s| < 2^-14
      nop.i 999
}
{ .mfi
      nop.m 999
      fnma.s1 V_lo = N, P_2, V_hi // V_lo = V_hi - N * P_2 for |s| < 2^-14
      nop.i 999
}
;;

// Decide between case 3 and 4:
// Case 3:  |s| >= 2**(-14)     Set p10
// Case 4:  |s| <  2**(-14)     Set p11
//
// Case 4: U_lo = N_0 * d_1 - U_hi
{ .mfi
      nop.m 999
      fms.s1 U_lo = N_0, d_1, U_hi // U_lo = N_0*d_1 - U_hi for |s| < 2^-14
      nop.i 999
}
{ .mfi
      nop.m 999
      fcmp.lt.s1 p11, p10 = s_val, TWO_TO_NEG14
      nop.i 999
}
;;

// Case 4: We need abs of both U_hi and V_hi - don't
// worry about switched sign of V_hi.
{ .mfi
      nop.m 999
      fabs V_hiabs = V_hi          // |V_hi| for |s| < 2^-14
      nop.i 999
}
{ .mfi
      nop.m 999
(p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14
      nop.i 999
}
;;

// Case 3: c = s_val - r
{ .mfi
      nop.m 999
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -