📄 s_asinh.s
字号:
.file "asinh.s"


// Copyright (c) 2000 - 2005, Intel Corporation
// All rights reserved.
//
// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.

// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// ==============================================================
// History
// ==============================================================
// 04/02/01 Initial version
// 04/19/01 Improved speed of the paths #1,2,3,4,5
// 10/18/01 Improved accuracy
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/06/03 Reordered header: .section, .global, .proc, .align
// 05/21/03 Improved performance, fixed to handle unorms
// 03/31/05 Reformatted delimiters between data tables
//
// API
// ==============================================================
// double asinh(double)
//
// Overview of operation
// ==============================================================
//
// There are 7 paths:
// 1. x = 0.0
//    Return asinh(x) = 0.0
//
// 2. 0.0 <|x| < 2^(-3)
//    Return asinh(x) = POL13(x),
//    where POL13(x) = (((x^2*C13 + ...)*x^2 + C5)*x^2 + C3)*x^3 + x
//
// 3. 2^(-3) <= |x| < 2^63
//    Return asinh(x) = sign(x)*(log(|x| + sqrt(x^2 + 1.0)))
//    To compute x + sqrt(x^2 + 1.0) modified Newton Raphson method is used
//    (3 iterations)
//    Algorithm description for log function see below.
//
// 4. 2^63 <= |x| < +INF
//    Return asinh(x) = sign(x)*log(2*|x|)
//    Algorithm description for log function see below.
//
// 5. x = INF
//    Return asinh(x) = INF
//
// 6. x = [S,Q]NaN
//    Return asinh(x) = QNaN
//
// 7. x = denormal
//    Return asinh(x) = x correctly rounded
//
//==============================================================
// Algorithm Description for log(x) function
// Below we are using the fact that inequality x - 1.0 > 2^(-6) is always
// true for this asinh implementation
//
// Consider x = 2^N 1.f1 f2 f3 f4...f63
// Log(x) = log(frcpa(x) x/frcpa(x))
//        = log(1/frcpa(x)) + log(frcpa(x) x)
//        = -log(frcpa(x)) + log(frcpa(x) x)
//
// frcpa(x) = 2^-N frcpa(1.f1 f2 ... f63)
//
// -log(frcpa(x)) = -log(C)
//                = -log(2^-N) - log(frcpa(1.f1 f2 ... f63))
//
// -log(frcpa(x)) = -log(C)
//                = +Nlog2 - log(frcpa(1.f1 f2 ... f63))
//
// -log(frcpa(x)) = -log(C)
//                = +Nlog2 + log(1/frcpa(1.f1 f2 ... f63))
//
// Log(x) = log(1/frcpa(x)) + log(frcpa(x) x)
//
// Log(x) = +Nlog2 + log(1./frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x)
// Log(x) = +Nlog2 - log(frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x)
// Log(x) = +Nlog2 + T + log(frcpa(x) x)
//
// Log(x) = +Nlog2 + T + log(C x)
//
// Cx = 1 + r
//
// Log(x) = +Nlog2 + T + log(1+r)
// Log(x) = +Nlog2 + T + Series( r - r^2/2 + r^3/3 - r^4/4 ....)
//
// 1.f1 f2 ... f8 has 256 entries.
// They are 1 + k/2^8, k = 0 ... 255
// These 256 values are the table entries.
//
// Implementation
//==============================================================
// C = frcpa(x)
// r = C * x - 1
//
// Form rseries = r + P1*r^2 + P2*r^3 + P3*r^4 + P4*r^5 + P5*r^6
//
// x = f * 2^n where f is 1.f_1f_2f_3....f_63
// Nfloat = float(n) where n is the true unbiased exponent
// pre-index = f_1f_2....f_8
// index = pre_index * 16
// get the dxt table entry at index + offset = T
//
// result = (T + Nfloat * log(2)) + rseries
//
// The T table is calculated as follows
// Form x_k = 1 + k/2^8 where k goes from 0... 255
// y_k = frcpa(x_k)
// log(1/y_k) in quad and round to double-extended
//
//
// Registers used
//==============================================================
// Floating Point registers used:
// f8, input
// f9 -> f15, f32 -> f68

// General registers used:
// r14 -> r27

// Predicate registers used:
// p6 -> p14

// p6 to filter out case when x = [Q,S]NaN or INF or zero
// p7 to filter out case when x < 0.0
// p8 to select path #2
// p9 used in the frcpa from path #3
// p11 to filter out case when x >= 0
// p12 to filter out case when x = unorm
// p13 to select path #4

// Assembly macros
//==============================================================
// Symbolic names for the general registers r14 -> r27.
log_GR_exp_17_ones    = r14
log_GR_signexp_f8     = r15
log_table_address2    = r16
log_GR_exp_16_ones    = r17
log_GR_exp_f8         = r18
log_GR_true_exp_f8    = r19
log_GR_significand_f8 = r20
log_GR_index          = r21
log_GR_comp2          = r22
asinh_GR_f8           = r23
asinh_GR_comp         = r24
// NOTE(review): asinh_GR_f8 is assigned a second time here (r23 above,
// r25 below). The code that uses it is not visible from this section;
// confirm which register is intended before removing either line.
asinh_GR_f8           = r25
log_table_address3    = r26
NR_table_address      = r27

//==============================================================
// Symbolic names for the floating-point registers f9 -> f15, f32 -> f68.
log_y                 = f9
NR1                   = f10
NR2                   = f11
log_y_rs              = f12
log_y_rs_iter         = f13
log_y_rs_iter1        = f14
fNormX                = f15
asinh_w_sq            = f32
log_C13               = f33
log_C11               = f34
log_P3                = f35
log_P2                = f36
log_P1                = f37
log_P5                = f38
log_P4                = f39
log_C3                = f40
log_C5                = f41
log_C7                = f42
log2                  = f43
asinh_f8              = f44
log_C                 = f45
log_arg               = f46
log_C9                = f47
asinh_w_four          = f48
log_int_Nfloat        = f49
log_r                 = f50
log_rsq               = f51
log_rp_p4             = f52
log_rp_p32            = f53
log_rcube             = f54
log_rp_p10            = f55
log_rp_p2             = f56
log_Nfloat            = f57
log_T                 = f58
log_r2P_r             = f59
log_T_plus_Nlog2      = f60
asinh_w_3             = f61
asinh_w_5             = f62
asinh_w_cube          = f63
asinh_w_7             = f64
log_arg_early         = f65
asinh_w_9             = f66
asinh_w_13            = f67
asinh_w_seven         = f68


// Data tables
//==============================================================

RODATA
.align 16

// Coefficients P5..P1 of the log series, a pad word, then log2.
LOCAL_OBJECT_START(log_table_1)
data8 0xBFC5555DA7212371 // P5
data8 0x3FC999A19EEF5826 // P4
data8 0xBFCFFFFFFFFEF009 // P3
data8 0x3FD555555554ECB2 // P2
data8 0xBFE0000000000000 // P1 = -0.5
data8 0x0000000000000000 // pad
data8 0xb17217f7d1cf79ac, 0x00003ffe // log2
LOCAL_OBJECT_END(log_table_1)

// Constants 0.5 and 3.0, then coefficients C13..C3 of POL13 (path #2).
LOCAL_OBJECT_START(log_table_2)
data8 0x3FE0000000000000 // 0.5
data8 0x4008000000000000 // 3.0
//
data8 0x8824BE4D74BC4F00, 0x00003FF9 // C13
data8 0xB725A2CD9556CC57, 0x0000BFF9 // C11
data8 0xF8E339127FBFF49D, 0x00003FF9 // C9
data8 0xB6DB6D7DCE17CB78, 0x0000BFFA // C7
data8 0x999999998802CCEF, 0x00003FFB // C5
data8 0xAAAAAAAAAAA8DC40, 0x0000BFFC // C3
LOCAL_OBJECT_END(log_table_2)


// T table: per the description above, entry k is log(1/frcpa(x_k)) with
// x_k = 1 + k/2^8. The per-entry comments below spell the step as "2^-8".
// The lone "//" lines are group delimiters, not commented-out entries.
LOCAL_OBJECT_START(log_table_3)
data8 0x80200aaeac44ef38 , 0x00003ff6 // log(1/frcpa(1+ 0/2^-8))
//
data8 0xc09090a2c35aa070 , 0x00003ff7 // log(1/frcpa(1+ 1/2^-8))
data8 0xa0c94fcb41977c75 , 0x00003ff8 // log(1/frcpa(1+ 2/2^-8))
data8 0xe18b9c263af83301 , 0x00003ff8 // log(1/frcpa(1+ 3/2^-8))
data8 0x8d35c8d6399c30ea , 0x00003ff9 // log(1/frcpa(1+ 4/2^-8))
data8 0xadd4d2ecd601cbb8 , 0x00003ff9 // log(1/frcpa(1+ 5/2^-8))
//
data8 0xce95403a192f9f01 , 0x00003ff9 // log(1/frcpa(1+ 6/2^-8))
data8 0xeb59392cbcc01096 , 0x00003ff9 // log(1/frcpa(1+ 7/2^-8))
data8 0x862c7d0cefd54c5d , 0x00003ffa // log(1/frcpa(1+ 8/2^-8))
data8 0x94aa63c65e70d499 , 0x00003ffa // log(1/frcpa(1+ 9/2^-8))
data8 0xa54a696d4b62b382 , 0x00003ffa // log(1/frcpa(1+ 10/2^-8))
//
data8 0xb3e4a796a5dac208 , 0x00003ffa // log(1/frcpa(1+ 11/2^-8))
data8 0xc28c45b1878340a9 , 0x00003ffa // log(1/frcpa(1+ 12/2^-8))
data8 0xd35c55f39d7a6235 , 0x00003ffa // log(1/frcpa(1+ 13/2^-8))
data8 0xe220f037b954f1f5 , 0x00003ffa // log(1/frcpa(1+ 14/2^-8))
data8 0xf0f3389b036834f3 , 0x00003ffa // log(1/frcpa(1+ 15/2^-8))
//
data8 0xffd3488d5c980465 , 0x00003ffa // log(1/frcpa(1+ 16/2^-8))
data8 0x87609ce2ed300490 , 0x00003ffb // log(1/frcpa(1+ 17/2^-8))
data8 0x8ede9321e8c85927 , 0x00003ffb // log(1/frcpa(1+ 18/2^-8))
data8 0x96639427f2f8e2f4 , 0x00003ffb // log(1/frcpa(1+ 19/2^-8))
data8 0x9defad3e8f73217b , 0x00003ffb // log(1/frcpa(1+ 20/2^-8))
//
data8 0xa582ebd50097029c , 0x00003ffb // log(1/frcpa(1+ 21/2^-8))
data8 0xac06dbe75ab80fee , 0x00003ffb // log(1/frcpa(1+ 22/2^-8))
data8 0xb3a78449b2d3ccca , 0x00003ffb // log(1/frcpa(1+ 23/2^-8))
data8 0xbb4f79635ab46bb2 , 0x00003ffb // log(1/frcpa(1+ 24/2^-8))
data8 0xc2fec93a83523f3f , 0x00003ffb // log(1/frcpa(1+ 25/2^-8))
//
data8 0xc99af2eaca4c4571 , 0x00003ffb // log(1/frcpa(1+ 26/2^-8))
data8 0xd1581106472fa653 , 0x00003ffb // log(1/frcpa(1+ 27/2^-8))
data8 0xd8002560d4355f2e , 0x00003ffb // log(1/frcpa(1+ 28/2^-8))
data8 0xdfcb43b4fe508632 , 0x00003ffb // log(1/frcpa(1+ 29/2^-8))
data8 0xe67f6dff709d4119 , 0x00003ffb // log(1/frcpa(1+ 30/2^-8))
//
data8 0xed393b1c22351280 , 0x00003ffb // log(1/frcpa(1+ 31/2^-8))
data8 0xf5192bff087bcc35 , 0x00003ffb // log(1/frcpa(1+ 32/2^-8))
data8 0xfbdf4ff6dfef2fa3 , 0x00003ffb // log(1/frcpa(1+ 33/2^-8))
data8 0x81559a97f92f9cc7 , 0x00003ffc // log(1/frcpa(1+ 34/2^-8))
data8 0x84be72bce90266e8 , 0x00003ffc // log(1/frcpa(1+ 35/2^-8))
//
data8 0x88bc74113f23def2 , 0x00003ffc // log(1/frcpa(1+ 36/2^-8))
data8 0x8c2ba3edf6799d11 , 0x00003ffc // log(1/frcpa(1+ 37/2^-8))
data8 0x8f9dc92f92ea08b1 , 0x00003ffc // log(1/frcpa(1+ 38/2^-8))
data8 0x9312e8f36efab5a7 , 0x00003ffc // log(1/frcpa(1+ 39/2^-8))
data8 0x968b08643409ceb6 , 0x00003ffc // log(1/frcpa(1+ 40/2^-8))
//
data8 0x9a062cba08a1708c , 0x00003ffc // log(1/frcpa(1+ 41/2^-8))
data8 0x9d845b3abf95485c , 0x00003ffc // log(1/frcpa(1+ 42/2^-8))
data8 0xa06fd841bc001bb4 , 0x00003ffc // log(1/frcpa(1+ 43/2^-8))
data8 0xa3f3a74652fbe0db , 0x00003ffc // log(1/frcpa(1+ 44/2^-8))
data8 0xa77a8fb2336f20f5 , 0x00003ffc // log(1/frcpa(1+ 45/2^-8))
//
data8 0xab0497015d28b0a0 , 0x00003ffc // log(1/frcpa(1+ 46/2^-8))
data8 0xae91c2be6ba6a615 , 0x00003ffc // log(1/frcpa(1+ 47/2^-8))
data8 0xb189d1b99aebb20b , 0x00003ffc // log(1/frcpa(1+ 48/2^-8))
data8 0xb51cced5de9c1b2c , 0x00003ffc // log(1/frcpa(1+ 49/2^-8))
data8 0xb819bee9e720d42f , 0x00003ffc // log(1/frcpa(1+ 50/2^-8))
//
data8 0xbbb2a0947b093a5d , 0x00003ffc // log(1/frcpa(1+ 51/2^-8))
data8 0xbf4ec1505811684a , 0x00003ffc // log(1/frcpa(1+ 52/2^-8))
data8 0xc2535bacfa8975ff , 0x00003ffc // log(1/frcpa(1+ 53/2^-8))
data8 0xc55a3eafad187eb8 , 0x00003ffc // log(1/frcpa(1+ 54/2^-8))
data8 0xc8ff2484b2c0da74 , 0x00003ffc // log(1/frcpa(1+ 55/2^-8))
//
data8 0xcc0b1a008d53ab76 , 0x00003ffc // log(1/frcpa(1+ 56/2^-8))
data8 0xcfb6203844b3209b , 0x00003ffc // log(1/frcpa(1+ 57/2^-8))
data8 0xd2c73949a47a19f5 , 0x00003ffc // log(1/frcpa(1+ 58/2^-8))
data8 0xd5daae18b49d6695 , 0x00003ffc // log(1/frcpa(1+ 59/2^-8))
data8 0xd8f08248cf7e8019 , 0x00003ffc // log(1/frcpa(1+ 60/2^-8))
//
data8 0xdca7749f1b3e540e , 0x00003ffc // log(1/frcpa(1+ 61/2^-8))
data8 0xdfc28e033aaaf7c7 , 0x00003ffc // log(1/frcpa(1+ 62/2^-8))
data8 0xe2e012a5f91d2f55 , 0x00003ffc // log(1/frcpa(1+ 63/2^-8))
data8 0xe600064ed9e292a8 , 0x00003ffc // log(1/frcpa(1+ 64/2^-8))
data8 0xe9226cce42b39f60 , 0x00003ffc // log(1/frcpa(1+ 65/2^-8))
//
data8 0xec4749fd97a28360 , 0x00003ffc // log(1/frcpa(1+ 66/2^-8))
data8 0xef6ea1bf57780495 , 0x00003ffc // log(1/frcpa(1+ 67/2^-8))
data8 0xf29877ff38809091 , 0x00003ffc // log(1/frcpa(1+ 68/2^-8))
data8 0xf5c4d0b245cb89be , 0x00003ffc // log(1/frcpa(1+ 69/2^-8))
data8 0xf8f3afd6fcdef3aa , 0x00003ffc // log(1/frcpa(1+ 70/2^-8))
//
data8 0xfc2519756be1abc7 , 0x00003ffc // log(1/frcpa(1+ 71/2^-8))
data8 0xff59119f503e6832 , 0x00003ffc // log(1/frcpa(1+ 72/2^-8))
data8 0x8147ce381ae0e146 , 0x00003ffd // log(1/frcpa(1+ 73/2^-8))
data8 0x82e45f06cb1ad0f2 , 0x00003ffd // log(1/frcpa(1+ 74/2^-8))
data8 0x842f5c7c573cbaa2 , 0x00003ffd // log(1/frcpa(1+ 75/2^-8))
//
data8 0x85ce471968c8893a , 0x00003ffd // log(1/frcpa(1+ 76/2^-8))
data8 0x876e8305bc04066d , 0x00003ffd // log(1/frcpa(1+ 77/2^-8))
data8 0x891012678031fbb3 , 0x00003ffd // log(1/frcpa(1+ 78/2^-8))
data8 0x8a5f1493d766a05f , 0x00003ffd // log(1/frcpa(1+ 79/2^-8))
data8 0x8c030c778c56fa00 , 0x00003ffd // log(1/frcpa(1+ 80/2^-8))
//
data8 0x8da85df17e31d9ae , 0x00003ffd // log(1/frcpa(1+ 81/2^-8))
data8 0x8efa663e7921687e , 0x00003ffd // log(1/frcpa(1+ 82/2^-8))
data8 0x90a22b6875c6a1f8 , 0x00003ffd // log(1/frcpa(1+ 83/2^-8))
data8 0x91f62cc8f5d24837 , 0x00003ffd // log(1/frcpa(1+ 84/2^-8))
data8 0x93a06cfc3857d980 , 0x00003ffd // log(1/frcpa(1+ 85/2^-8))
//
data8 0x94f66d5e6fd01ced , 0x00003ffd // log(1/frcpa(1+ 86/2^-8))
data8 0x96a330156e6772f2 , 0x00003ffd // log(1/frcpa(1+ 87/2^-8))
data8 0x97fb3582754ea25b , 0x00003ffd // log(1/frcpa(1+ 88/2^-8))
data8 0x99aa8259aad1bbf2 , 0x00003ffd // log(1/frcpa(1+ 89/2^-8))
data8 0x9b0492f6227ae4a8 , 0x00003ffd // log(1/frcpa(1+ 90/2^-8))
//
data8 0x9c5f8e199bf3a7a5 , 0x00003ffd // log(1/frcpa(1+ 91/2^-8))
data8 0x9e1293b9998c1daa , 0x00003ffd // log(1/frcpa(1+ 92/2^-8))
data8 0x9f6fa31e0b41f308 , 0x00003ffd // log(1/frcpa(1+ 93/2^-8))
data8 0xa0cda11eaf46390e , 0x00003ffd // log(1/frcpa(1+ 94/2^-8))
data8 0xa22c8f029cfa45aa , 0x00003ffd // log(1/frcpa(1+ 95/2^-8))
//
data8 0xa3e48badb7856b34 , 0x00003ffd // log(1/frcpa(1+ 96/2^-8))
data8 0xa5459a0aa95849f9 , 0x00003ffd // log(1/frcpa(1+ 97/2^-8))
data8 0xa6a79c84480cfebd , 0x00003ffd // log(1/frcpa(1+ 98/2^-8))
data8 0xa80a946d0fcb3eb2 , 0x00003ffd // log(1/frcpa(1+ 99/2^-8))
data8 0xa96e831a3ea7b314 , 0x00003ffd // log(1/frcpa(1+100/2^-8))
//
data8 0xaad369e3dc544e3b , 0x00003ffd // log(1/frcpa(1+101/2^-8))
data8 0xac92e9588952c815 , 0x00003ffd // log(1/frcpa(1+102/2^-8))
data8 0xadfa035aa1ed8fdc , 0x00003ffd // log(1/frcpa(1+103/2^-8))
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -