📄 qsqrt.h

📁 用于GPU通用计算的编程语言BrookGPU 0.4
💻 H
字号:
/*    Copyright (C) 2000 by Andrew Zabolotny (Intel version)    Copyright (C) 2002 by Matthew Reda <reda@mac.com> (PowerPC version)    Fast computation of sqrt(x) and 1/sqrt(x)      This library is free software; you can redistribute it and/or    modify it under the terms of the GNU Library General Public    License as published by the Free Software Foundation; either    version 2 of the License, or (at your option) any later version.      This library is distributed in the hope that it will be useful,    but WITHOUT ANY WARRANTY; without even the implied warranty of    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU    Library General Public License for more details.      You should have received a copy of the GNU Library General Public    License along with this library; if not, write to the Free    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.*/// Uncomment the following line to define CS_NO_QSQRT if you experience// mysterious problems with CS which you think are related to this// version of sqrt not behaving properly. If you find something like// that I'd like to be notified of this so we can make sure this really// is the problem.//#define CS_NO_QSQRT#ifndef __QSQRT_H__#define __QSQRT_H__#if (!defined (CS_NO_QSQRT)) && defined (PROC_X86) && defined (COMP_GCC)/*  NB: Single-precision floating-point format (32 bits):	SEEEEEEE.EMMMMMMM.MMMMMMMM.MMMMMMMM	S: Sign (0 - positive, 1 - negative)	E: Exponent (plus 127, 8 bits)	M: Mantissa (23 bits)*//** * This is a relatively small inline routine which will compute sqrt(x) * very quick, taking a square root is now just a little longer than * doing a division. The function is inline so that it runs at maximal * possible speed. It does eight multiplications but multiplication is * cheap on P5+ processors (3 clocks). On a Celeron CPU it takes * approximatively 50 clocks, while a division is 42 clocks. */static inline float qsqrt (float x){  float ret;// Original C++ formulae:// float tmp = x;// *((unsigned *)&tmp) = (0xbe6f0000 - *((unsigned *)&tmp)) >> 1;// double h = x * 0.5;// double a = tmp;// a *= 1.5 - a * a * h;// a *= 1.5 - a * a * h;// return a * x;  __asm__ (		"flds	%1\n"			// x		"movl	$0xbe6f0000,%%eax\n"		"subl	%1,%%eax\n"		"shrl	$1,%%eax\n"		"movl	%%eax,%1\n"		"flds	%2\n"			// x 0.5		"fmul	%%st(1)\n"		// x h		"flds	%3\n"			// x h 1.5		"flds	%1\n"			// x h 1.5 a		"fld	%%st\n"			// x h 1.5 a a		"fmul	%%st\n"			// x h 1.5 a a*a		"fmul	%%st(3)\n"		// x h 1.5 a a*a*h		"fsubr	%%st(2)\n"		// x h 1.5 a 1.5-a*a*h		"fmulp	%%st(1)\n"		// x h 1.5 a		"fld	%%st\n"			// x h 1.5 a a		"fmul	%%st\n"			// x h 1.5 a a*a		"fmulp	%%st(3)\n"		// x a*a*h 1.5 a		"fxch\n"			// x a*a*h a 1.5		"fsubp  %%st,%%st(2)\n"		// x 1.5-a*a*h a		"fmulp	%%st(1)\n"		// x a		"fmulp	%%st(1)\n"		// a	: "=&t" (ret), "+m" (x) : "m" (0.5F), "m" (1.5F)	: "eax", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"  );  return ret;}/** * This routine is basically equivalent to qsqrt() except that it returns * 1/sqrt(x) rather than the proper square root. It should be used anywhere * you need the inverse root (in 3D graphics it is a common situation), * because the routine is a little faster than qsqrt() and also you avoid * a division (which gives you an overall 2X speedup). */static inline float qisqrt (float x){  float ret;  __asm__ (		"flds	%1\n"			// x		"movl	$0xbe6f0000,%%eax\n"		"subl	%1,%%eax\n"		"shrl	$1,%%eax\n"		"movl	%%eax,%1\n"		"flds	%2\n"			// x 0.5		"fmulp	%%st(1)\n"		// h		"flds	%3\n"			// h 1.5		"flds	%1\n"			// h 1.5 a		"fld	%%st\n"			// h 1.5 a a		"fmul	%%st\n"			// h 1.5 a a*a		"fmul	%%st(3)\n"		// h 1.5 a a*a*h		"fsubr	%%st(2)\n"		// h 1.5 a 1.5-a*a*h		"fmulp	%%st(1)\n"		// h 1.5 a		"fld	%%st\n"			// h 1.5 a a		"fmul	%%st\n"			// h 1.5 a a*a		"fmulp	%%st(3)\n"		// a*a*h 1.5 a		"fxch\n"			// a*a*h a 1.5		"fsubp  %%st,%%st(2)\n"		// 1.5-a*a*h a		"fmulp	%%st(1)\n"		// a	: "=t" (ret), "+m" (x) : "m" (0.5F), "m" (1.5F)	: "eax", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"  );  return ret;}#elif (!defined (CS_NO_QSQRT)) && defined (PROC_POWERPC) && defined (COMP_GCC)/** * Use the PowerPC fsqrte to get an estimate of 1/sqrt(x) Then apply two * Newton-Rhaphson refinement steps to get a more accurate response Finally * multiply by x to get x/sqrt(x) = sqrt(x).  Add additional refinement steps * to get a more accurate result.  Zero is treated as a special case, otherwise * we end up returning NaN (Not a Number). */static inline float qsqrt(float x){  float y0 = 0.0;  if (x != 0.0)  {    float x0 = x * 0.5f;    asm ("frsqrte %0,%1" : "=f" (y0) : "f" (x));        y0 = y0 * (1.5f - x0 * y0 * y0);    y0 = (y0 * (1.5f - x0 * y0 * y0)) * x;  };      return y0;};/** * Similar to qsqrt() above, except we do not multiply by x at the end, and * return 1/sqrt(x). */static inline float qisqrt(float x){  float x0 = x * 0.5f;  float y0;  asm ("frsqrte %0,%1" : "=f" (y0) : "f" (x));      y0 = y0 * (1.5f - x0 * y0 * y0);  y0 = y0 * (1.5f - x0 * y0 * y0);  return y0;};#else#include <math.h>#define qsqrt(x)  sqrt(x)#define qisqrt(x) (1.0/sqrt(x))#endif#endif // __QSQRT_H__
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -