⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 math.cpp

📁 3D游戏开发需要用到BSP树来进行场景渲染的管理。本代码包含完整的BSP及文件生成实现
💻 CPP
字号:
/*
    Copyright (C) 2008 Rouslan Dimitrov

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "math.h"

// CPUID (EAX=1) feature-flag masks: SSE/SSE2 are reported in EDX,
// SSE3 in ECX. BIT() is expected to come from math.h.
#define SSE1_BIT	BIT(25)	// edx
#define SSE2_BIT	BIT(26)	// edx
#define SSE3_BIT	BIT(0)	// ecx

// Shift distance needed to move a flag from bit `from` to bit `to`.
// Arguments are parenthesized so the macro expands safely when given
// expressions, not just plain constants.
#define BITSHIFT(from, to)	((from) - (to))

// CPU capability bits filled in by MathInit(); bit layout matches the
// CPUCAP_* masks (declared elsewhere, presumably math.h).
int g_cpuCaps;

// Detects SSE1/SSE2/SSE3 support via CPUID leaf 1 and stores the result
// in g_cpuCaps (bit 0 = SSE1, bit 1 = SSE2, bit 2 = SSE3 after the
// shifts below). Returns non-zero iff SSE1 — required by the rest of
// this module — is present.
// Naked + __fastcall: the prologue/epilogue is entirely ours, so we
// must preserve callee-saved registers manually.
int __declspec(naked) __fastcall MathInit() {
	__asm {
		push	ebx					; CPUID clobbers EBX, a callee-saved register
		mov		eax, 1
		cpuid
		and		ecx, SSE3_BIT
		and		edx, (SSE1_BIT | SSE2_BIT)
		shl		ecx, -BITSHIFT(0, 2)	; SSE3: bit 0 -> bit 2
		shr		edx, BITSHIFT(25, 0)	; SSE1: bit 25 -> bit 0, SSE2: bit 26 -> bit 1
		or		edx, ecx
		mov		[g_cpuCaps], edx
		and		edx, CPUCAP_SSE1
		mov		eax, edx			; return non-zero if SSE1 (required) present
		pop		ebx
		ret
	}
}

/*
 * 4-wide dot product. Returns a vector whose element 0 holds
 * dot(a, b); elements 1..3 hold the per-lane products a[i]*b[i]
 * (callers in this file only consume element 0).
 * Note: the `register` keyword was dropped (removed in C++17) and the
 * MSVC-only m128_f32 union access replaced with a store/load pair so
 * the function compiles on any SSE-capable compiler.
 */
static __m128 dot_ps(__m128 a, __m128 b) {
#if 0
	// Pure-SSE horizontal add via shuffles.
	__m128 r1 = _mm_mul_ps(a, b);
	__m128 r2 = _mm_shuffle_ps(r1, r1, 0x4E);
	r1 = _mm_add_ps(r1, r2);
	r2 = _mm_shuffle_ps(r1, r1, 0x11);
	r1 = _mm_add_ps(r1, r2);
	return r1;
#else
	// slower on AMD Athlon32, faster on Intel C2
	__m128 sq = _mm_mul_ps(a, b);
	float lanes[4];
	_mm_storeu_ps(lanes, sq);
	lanes[0] = lanes[0] + lanes[1] + lanes[2] + lanes[3];
	return _mm_loadu_ps(lanes);
#endif
}

/*
 * Scalar (element 0) reciprocal square root of r, refined with one
 * Newton-Raphson iteration: x1 = 0.5 * x0 * (3 - r * x0^2), which
 * roughly doubles the precision of the _mm_rsqrt_ss hardware estimate.
 * (`register` dropped from the parameter: removed in C++17.)
 */
static __m128 inv_sq_root_ss(__m128 r) {
	const float c3 = 3.0f;
	const float c05 = 0.5f;
	__m128 x0 = _mm_rsqrt_ss(r);			// ~12-bit initial estimate
	__m128 x1 = _mm_mul_ss(x0, x0);			// x0^2
	x1 = _mm_mul_ss(x1, r);				// r * x0^2
	x1 = _mm_sub_ss(_mm_load_ss(&c3), x1);		// 3 - r*x0^2
	x1 = _mm_mul_ss(_mm_load_ss(&c05), x1);		// 0.5 * (3 - r*x0^2)
	x1 = _mm_mul_ss(x1, x0);			// * x0
	return x1;
}

// Euclidean length of the vector: sqrt(dot(this, this)) over all four
// components.
float vec4::magnitude() {
	const __m128 lenSq = dot_ps(dataSSE, dataSSE);

	float length;
#if 1 // exact, and 20% faster on AMD Athlon32
	_mm_store_ss(&length, _mm_sqrt_ss(lenSq));
#else // approximate: len = lenSq * rsqrt(lenSq)
	_mm_store_ss(&length, _mm_mul_ss(lenSq, inv_sq_root_ss(lenSq)));
#endif
	return length;
}


// 3D cross product via two rotated lane products:
//   (y*v.z - z*v.y, z*v.x - x*v.z, x*v.y - y*v.x, 0)
// Both shuffles keep index 3 in lane 3, so the w lane evaluates to
// w*v.w - w*v.w == 0.
vec4 vec4::cross3D(vec4& v) {
#if 1
	vec4 result;
	// this rotated to (y,z,x), v rotated to (z,x,y) — and the mirror pair
	__declspec(align(16)) __m128 lhsYZX = _mm_shuffle_ps(dataSSE, dataSSE, _MM_SHUFFLE(3,0,2,1));
	__m128 rhsZXY = _mm_shuffle_ps(v.dataSSE, v.dataSSE, _MM_SHUFFLE(3,1,0,2));
	__m128 lhsZXY = _mm_shuffle_ps(dataSSE, dataSSE, _MM_SHUFFLE(3,1,0,2));
	__m128 rhsYZX = _mm_shuffle_ps(v.dataSSE, v.dataSSE, _MM_SHUFFLE(3,0,2,1));
	lhsYZX = _mm_mul_ps(lhsYZX, rhsZXY);
	lhsZXY = _mm_mul_ps(lhsZXY, rhsYZX);
	result.dataSSE = _mm_sub_ps(lhsYZX, lhsZXY);
	return result;
#else
	// Scalar reference implementation.
	vec4 a = *this;
	return vec4(a.y*v.z - a.z*v.y, a.z*v.x - a.x*v.z, a.x*v.y - a.y*v.x, 0.0f);
#endif
}


// Scales the vector in place by 1/length, using the refined rsqrt
// estimate from inv_sq_root_ss.
void vec4::normalize() {
	const __m128 lenSq = dot_ps(dataSSE, dataSSE);
	__m128 invLen = inv_sq_root_ss(lenSq);
	invLen = _mm_shuffle_ps(invLen, invLen, 0x00);	// broadcast lane 0 to all lanes
	dataSSE = _mm_mul_ps(dataSSE, invLen);
}

// Matrix * vector: result = dataSSE[0]*v.x + dataSSE[1]*v.y
//                         + dataSSE[2]*v.z + dataSSE[3]*v.w,
// i.e. each 4-float register of the matrix is scaled by one broadcast
// component of v and the four products are summed.
vec4 mat4::operator*(vec4& v) {
	vec4 result;

	// Broadcast each component of v across a full register.
	const __m128 xxxx = _mm_shuffle_ps(v.dataSSE, v.dataSSE, 0x00);
	const __m128 yyyy = _mm_shuffle_ps(v.dataSSE, v.dataSSE, 0x55);
	const __m128 zzzz = _mm_shuffle_ps(v.dataSSE, v.dataSSE, 0xAA);
	const __m128 wwww = _mm_shuffle_ps(v.dataSSE, v.dataSSE, 0xFF);

	// Accumulate the linear combination.
	__m128 acc = _mm_mul_ps(dataSSE[0], xxxx);
	acc = _mm_add_ps(acc, _mm_mul_ps(dataSSE[1], yyyy));
	acc = _mm_add_ps(acc, _mm_mul_ps(dataSSE[2], zzzz));
	acc = _mm_add_ps(acc, _mm_mul_ps(dataSSE[3], wwww));
	result.dataSSE = acc;

	return result;
}

#pragma warning(disable : 4700)

// In-place 4x4 transpose: gathers two 64-bit half-rows per register
// with loadl/loadh, then interleaves them with shuffles (the standard
// SSE 4x4-transpose pattern written out by hand).
// tmp1/tmp2 are deliberately passed uninitialized to the first
// _mm_loadl_pi (warning C4700 is disabled above); both halves are
// loaded before any lane is read.
void mat4::TransposeSelf() {
	__m128 tmp1, tmp2, row1, row2;

	// Left half of the matrix -> transposed rows 0 and 1.
	// tmp1 = [d0 d1 d4 d5], tmp2 = [d8 d9 d12 d13]
	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(data)), (__m64*)(data+4));
	tmp2 = _mm_loadh_pi(_mm_loadl_pi(tmp2, (__m64*)(data+8)), (__m64*)(data+12));
	row1 = _mm_shuffle_ps(tmp1, tmp2, 0x88);	// [d0 d4 d8 d12]
	row2 = _mm_shuffle_ps(tmp1, tmp2, 0xDD);	// [d1 d5 d9 d13]

	// Right half is loaded BEFORE the stores below, so the in-place
	// writes to dataSSE[0]/[1] cannot clobber pending source data.
	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(data+2)), (__m64*)(data+6));
	tmp2 = _mm_loadh_pi(_mm_loadl_pi(tmp2, (__m64*)(data+10)), (__m64*)(data+14));
	dataSSE[0] = row1;
	dataSSE[1] = row2;
	dataSSE[2] = _mm_shuffle_ps(tmp1, tmp2, 0x88);	// [d2 d6 d10 d14]
	dataSSE[3] = _mm_shuffle_ps(tmp1, tmp2, 0xDD);	// [d3 d7 d11 d15]
}

// Fast modelview inverse: transposes the upper 3x3 block and rotates
// the translation row back through it. Assumes the upper 3x3 is
// orthonormal (pure rotation) so that transpose == inverse; not valid
// if the matrix carries scale or shear.
mat4 mat4::InverseModelView() {
	mat4 imv;

	// Transpose the 3x3 rotation block; zero the fourth column.
	for (int row = 0; row < 3; row++) {
		for (int col = 0; col < 3; col++)
			imv.data[row*4 + col] = data[col*4 + row];
		imv.data[row*4 + 3] = 0.0f;
	}

	// New translation = -(old translation · each rotation row).
	for (int col = 0; col < 3; col++)
		imv.data[12 + col] = -(data[12] * data[col*4])
						   - (data[13] * data[col*4 + 1])
						   - (data[14] * data[col*4 + 2]);
	imv.data[15] = 1.0f;

	return imv;
}

// The code block below is based on Intel sample code from:
// "Streaming SIMD Extensions - Inverse of 4x4 Matrix", order #245043-001, Intel 1999
// Inverse using Cramer's rule: inv(M) = adj(M) / det(M), with the four
// matrix rows kept in SSE registers throughout. There is no singularity
// check — if det(M) is 0 the reciprocal below diverges and the stored
// result is meaningless.
void mat4::InverseSelf() {
	__m128 minor0, minor1, minor2, minor3;
	__m128 row0, row1, row2, row3;
	__m128 det, tmp1;

	// Modified Transpose
	// Loads the matrix while transposing it into row0..row3 in the lane
	// order the cofactor computation expects. tmp1/row1/row3 are passed
	// uninitialized to the first _mm_loadl_pi on purpose (warning C4700
	// is disabled above): every lane is written by the loadl/loadh pair
	// before being read.
	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(data)), (__m64*)(data+4));
	row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(data+8)), (__m64*)(data+12));
	row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
	row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(data+2)), (__m64*)(data+6));
	row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(data+10)), (__m64*)(data+14));
	row2= _mm_shuffle_ps(tmp1, row3, 0x88);
	row3= _mm_shuffle_ps(row3, tmp1, 0xDD);
	//	Cofactors
	// Each section forms 2x2 sub-determinant products from a pair of
	// rows (0xB1 swaps adjacent lane pairs, 0x4E swaps register halves)
	// and accumulates them into the four adjugate rows minor0..minor3.
	tmp1= _mm_mul_ps(row2, row3);
	tmp1= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0= _mm_mul_ps(row1, tmp1);
	minor1= _mm_mul_ps(row0, tmp1);
	tmp1= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0= _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1= _mm_shuffle_ps(minor1, minor1, 0x4E);
	//-----------------------------------------------
	tmp1= _mm_mul_ps(row1, row2);
	tmp1= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	minor3= _mm_mul_ps(row0, tmp1);
	tmp1= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0= _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	minor3= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
	minor3= _mm_shuffle_ps(minor3, minor3, 0x4E);
	//-----------------------------------------------
	tmp1= _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
	tmp1= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	row2= _mm_shuffle_ps(row2, row2, 0x4E);
	minor0= _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	minor2= _mm_mul_ps(row0, tmp1);
	tmp1= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0= _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2= _mm_shuffle_ps(minor2, minor2, 0x4E);
	//-----------------------------------------------
	tmp1= _mm_mul_ps(row0, row1);
	tmp1= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor2= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3= _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
	tmp1= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor2= _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3= _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
	//-----------------------------------------------
	tmp1= _mm_mul_ps(row0, row3);
	tmp1= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1= _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2= _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1= _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2= _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	//-----------------------------------------------
	tmp1= _mm_mul_ps(row0, row2);
	tmp1= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	minor3= _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
	tmp1= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1= _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	minor3= _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
	// Determinant and its reciprocal
	// det = dot(row0, minor0), horizontally summed into every lane.
	det= _mm_mul_ps(row0, minor0);
	det= _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
	det= _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
	// _mm_rcp_ss gives ~12-bit 1/det; one Newton-Raphson step
	// (x1 = 2*x0 - det*x0^2) refines it before broadcasting.
	tmp1= _mm_rcp_ss(det);
	det= _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
	det= _mm_shuffle_ps(det, det, 0x00);
	// Multiply & store
	// Scale each adjugate row by 1/det and write back over data[].
	minor0= _mm_mul_ps(det, minor0);
	_mm_storel_pi((__m64*)(data), minor0);
	_mm_storeh_pi((__m64*)(data+2), minor0);
	minor1= _mm_mul_ps(det, minor1);
	_mm_storel_pi((__m64*)(data+4), minor1);
	_mm_storeh_pi((__m64*)(data+6), minor1);
	minor2= _mm_mul_ps(det, minor2);
	_mm_storel_pi((__m64*)(data+8), minor2);
	_mm_storeh_pi((__m64*)(data+10), minor2);
	minor3= _mm_mul_ps(det, minor3);
	_mm_storel_pi((__m64*)(data+12), minor3);
	_mm_storeh_pi((__m64*)(data+14), minor3);
}

#pragma warning(default : 4700)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -