// math.cpp
/*
Copyright (C) 2008 Rouslan Dimitrov
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "math.h"
#include <xmmintrin.h> // SSE intrinsics (__m128, _mm_*); math.h likely pulls this in already
#define SSE1_BIT BIT(25) // CPUID.1:EDX bit 25
#define SSE2_BIT BIT(26) // CPUID.1:EDX bit 26
#define SSE3_BIT BIT(0)  // CPUID.1:ECX bit 0
#define BITSHIFT(from, to) ((from) - (to))
int g_cpuCaps;
int __declspec(naked) __fastcall MathInit() {
    __asm {
        push ebx                     ; cpuid clobbers ebx, which is callee-saved
        mov eax, 1
        cpuid
        and ecx, SSE3_BIT
        and edx, (SSE1_BIT | SSE2_BIT)
        shl ecx, -BITSHIFT(0, 2)     ; SSE3: ecx bit 0 -> caps bit 2
        shr edx, BITSHIFT(25, 0)     ; SSE1/SSE2: edx bits 25/26 -> caps bits 0/1
        or edx, ecx
        mov [g_cpuCaps], edx
        and edx, CPUCAP_SSE1
        mov eax, edx                 ; return nonzero if SSE1 (required) is present
        pop ebx
        ret
    }
}
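// Usage sketch (added; hypothetical caller). CPUCAP_SSE1 is referenced above;
// a matching CPUCAP_SSE3 flag is assumed to exist alongside it in math.h:
//
//   if (!MathInit())
//       return FatalError("this program requires a CPU with SSE"); // hypothetical error handler
//   if (g_cpuCaps & CPUCAP_SSE3)
//       /* optionally select SSE3 code paths */;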
static __m128 dot_ps(register __m128 a, register __m128 b) {
#if 0
    // Horizontal add via shuffles: after the two add steps, every lane of r1
    // holds the full dot product.
    __m128 r1 = _mm_mul_ps(a, b);
    __m128 r2 = _mm_shuffle_ps(r1, r1, 0x4E); // swap the two 64-bit halves
    r1 = _mm_add_ps(r1, r2);
    r2 = _mm_shuffle_ps(r1, r1, 0x11);        // lanes (1,0,1,0) of the half-sums
    r1 = _mm_add_ps(r1, r2);
    return r1;
#else
    // Scalar horizontal add; slower on AMD Athlon32, faster on Intel Core 2.
    // Only lane 0 of the result holds the dot product.
    __m128 sq = _mm_mul_ps(a, b);
    sq.m128_f32[0] = sq.m128_f32[0]
                   + sq.m128_f32[1]
                   + sq.m128_f32[2]
                   + sq.m128_f32[3];
    return sq;
#endif
}
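// Scalar reference for dot_ps (an added illustration, not part of the original
// file): both SSE paths above compute exactly this 4-component dot product,
// differing only in how the final horizontal sum is formed.
static float dot_scalar(const float a[4], const float b[4]) {
    return a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
}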
static __m128 inv_sq_root_ss(register __m128 r) {
    // One Newton-Raphson iteration: x1 = 0.5 * x0 * (3 - R*x0^2)
    const float c3 = 3.0f;
    const float c05 = 0.5f;
    __m128 x0 = _mm_rsqrt_ss(r);             // rough estimate of 1/sqrt(R)
    __m128 x1 = _mm_mul_ss(x0, x0);
    x1 = _mm_mul_ss(x1, r);                  // R*x0^2
    x1 = _mm_sub_ss(_mm_load_ss(&c3), x1);   // 3 - R*x0^2
    x1 = _mm_mul_ss(_mm_load_ss(&c05), x1);  // 0.5 * (3 - R*x0^2)
    x1 = _mm_mul_ss(x1, x0);
    return x1;
}
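// Why one step is enough (added note): _mm_rsqrt_ss is accurate to about
// 12 bits, and each Newton-Raphson step roughly doubles the number of correct
// bits, so a single refinement gets close to full single precision. Scalar
// sketch of the same step (illustration only, not part of the original file):
static float inv_sqrt_refine(float R, float x0) {
    // x0 is a rough estimate of 1/sqrt(R); returns the refined estimate
    return 0.5f * x0 * (3.0f - R * x0 * x0);
}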
float vec4::magnitude() {
    __m128 dot = dot_ps(dataSSE, dataSSE);
    float m;
#if 1 // exact and 20% faster on AMD Athlon32
    _mm_store_ss(&m, _mm_sqrt_ss(dot));
#else // approximate: dot * 1/sqrt(dot) = sqrt(dot)
    _mm_store_ss(&m, _mm_mul_ss(dot, inv_sq_root_ss(dot)));
#endif
    return m;
}
vec4 vec4::cross3D(vec4& v) {
#if 1
    // cross(a,b) = a.yzx * b.zxy - a.zxy * b.yzx; the w lane cancels to 0
    vec4 r;
    __m128 a1 = _mm_shuffle_ps(dataSSE, dataSSE, _MM_SHUFFLE(3,0,2,1));     // (y,z,x,w)
    __m128 b1 = _mm_shuffle_ps(v.dataSSE, v.dataSSE, _MM_SHUFFLE(3,1,0,2)); // (z,x,y,w)
    __m128 a2 = _mm_shuffle_ps(dataSSE, dataSSE, _MM_SHUFFLE(3,1,0,2));     // (z,x,y,w)
    __m128 b2 = _mm_shuffle_ps(v.dataSSE, v.dataSSE, _MM_SHUFFLE(3,0,2,1)); // (y,z,x,w)
    a1 = _mm_mul_ps(a1, b1);
    a2 = _mm_mul_ps(a2, b2);
    r.dataSSE = _mm_sub_ps(a1, a2);
    return r;
#else
    vec4 a = *this;
    return vec4(a.y*v.z - a.z*v.y, a.z*v.x - a.x*v.z, a.x*v.y - a.y*v.x, 0.0f);
#endif
}
void vec4::normalize() {
    __m128 dot = dot_ps(dataSSE, dataSSE);
    __m128 isr = inv_sq_root_ss(dot);
    isr = _mm_shuffle_ps(isr, isr, 0x00); // broadcast 1/|v| from lane 0 to all lanes
    dataSSE = _mm_mul_ps(dataSSE, isr);
}
vec4 mat4::operator*(vec4& v) {
    // Column-major product: r = v.x*col0 + v.y*col1 + v.z*col2 + v.w*col3
    vec4 r;
    __m128 v0 = _mm_shuffle_ps(v.dataSSE, v.dataSSE, 0x00); // broadcast v.x
    __m128 v1 = _mm_shuffle_ps(v.dataSSE, v.dataSSE, 0x55); // broadcast v.y
    __m128 v2 = _mm_shuffle_ps(v.dataSSE, v.dataSSE, 0xAA); // broadcast v.z
    __m128 v3 = _mm_shuffle_ps(v.dataSSE, v.dataSSE, 0xFF); // broadcast v.w
    r.dataSSE = _mm_mul_ps(this->dataSSE[0], v0);
    r.dataSSE = _mm_add_ps(r.dataSSE, _mm_mul_ps(this->dataSSE[1], v1));
    r.dataSSE = _mm_add_ps(r.dataSSE, _mm_mul_ps(this->dataSSE[2], v2));
    r.dataSSE = _mm_add_ps(r.dataSSE, _mm_mul_ps(this->dataSSE[3], v3));
    return r;
}
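// Scalar reference for operator* (an added illustration, not part of the
// original file): with OpenGL-style column-major storage, column i occupies
// data[4*i .. 4*i+3], so the product is a linear combination of the columns.
static void mat4_mul_vec4_scalar(const float m[16], const float v[4], float r[4]) {
    for (int i = 0; i < 4; ++i)
        r[i] = m[i]*v[0] + m[4+i]*v[1] + m[8+i]*v[2] + m[12+i]*v[3];
}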
// C4700 ("uninitialized local variable used") is silenced below: the __m128
// temporaries are passed to _mm_loadl_pi only as destination seeds, and the
// loadl/loadh pair overwrites all four lanes before any lane is read.
#pragma warning(disable : 4700)
void mat4::TransposeSelf() {
    __m128 tmp1, tmp2, row1, row2;
    // Gather two half-rows per register; shuffle 0x88 picks the even lanes of
    // its two operands and 0xDD the odd lanes, yielding the transposed rows.
    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(data)), (__m64*)(data+4));
    tmp2 = _mm_loadh_pi(_mm_loadl_pi(tmp2, (__m64*)(data+8)), (__m64*)(data+12));
    row1 = _mm_shuffle_ps(tmp1, tmp2, 0x88);
    row2 = _mm_shuffle_ps(tmp1, tmp2, 0xDD);
    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(data+2)), (__m64*)(data+6));
    tmp2 = _mm_loadh_pi(_mm_loadl_pi(tmp2, (__m64*)(data+10)), (__m64*)(data+14));
    dataSSE[0] = row1;
    dataSSE[1] = row2;
    dataSSE[2] = _mm_shuffle_ps(tmp1, tmp2, 0x88);
    dataSSE[3] = _mm_shuffle_ps(tmp1, tmp2, 0xDD);
}
mat4 mat4::InverseModelView() {
    mat4 imv;
    imv.data[0] = data[0];
    imv.data[1] = data[4];
    imv.data[2] = data[8];
    imv.data[3] = 0.0f;
    imv.data[4] = data[1];
    imv.data[5] = data[5];
    imv.data[6] = data[9];
    imv.data[7] = 0.0f;
    imv.data[8] = data[2];
    imv.data[9] = data[6];
    imv.data[10] = data[10];
    imv.data[11] = 0.0f;
    imv.data[12] = -(data[12] * data[0]) - (data[13] * data[1]) - (data[14] * data[2]);
    imv.data[13] = -(data[12] * data[4]) - (data[13] * data[5]) - (data[14] * data[6]);
    imv.data[14] = -(data[12] * data[8]) - (data[13] * data[9]) - (data[14] * data[10]);
    imv.data[15] = 1.0f;
    return imv;
}
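// Why InverseModelView works (added note): for a rigid modelview matrix
// M = [R t; 0 1] with orthonormal rotation R and translation t, the inverse is
// M^-1 = [R^T  -R^T*t; 0 1], because R^-1 = R^T. The code above transposes the
// 3x3 block and sets each new translation entry to -(t . c_i), where c_i is
// the i-th column of R. The shortcut is invalid if the upper 3x3 block carries
// scale or shear; use the general InverseSelf() below in that case.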
// The code block below is based on Intel sample code from:
// "Streaming SIMD Extensions - Inverse of 4x4 Matrix", order #245043-001, Intel 1999
// Inverse using Cramer's rule
void mat4::InverseSelf() {
    __m128 minor0, minor1, minor2, minor3;
    __m128 row0, row1, row2, row3;
    __m128 det, tmp1;
    // Modified transpose: rows 1 and 3 come out rotated; the cofactor
    // arithmetic below is arranged to compensate
    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(data)), (__m64*)(data+4));
    row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(data+8)), (__m64*)(data+12));
    row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
    row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(data+2)), (__m64*)(data+6));
    row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(data+10)), (__m64*)(data+14));
    row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
    row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
    // Cofactors
    tmp1 = _mm_mul_ps(row2, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor0 = _mm_mul_ps(row1, tmp1);
    minor1 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
    minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
    minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
    //-----------------------------------------------
    tmp1 = _mm_mul_ps(row1, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
    minor3 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
    minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
    minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
    //-----------------------------------------------
    tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    row2 = _mm_shuffle_ps(row2, row2, 0x4E);
    minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
    minor2 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
    minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
    //-----------------------------------------------
    tmp1 = _mm_mul_ps(row0, row1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
    //-----------------------------------------------
    tmp1 = _mm_mul_ps(row0, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
    minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
    //-----------------------------------------------
    tmp1 = _mm_mul_ps(row0, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
    minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
    // Determinant and its reciprocal (one Newton-Raphson step on _mm_rcp_ss)
    det = _mm_mul_ps(row0, minor0);
    det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
    det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
    tmp1 = _mm_rcp_ss(det);
    det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
    det = _mm_shuffle_ps(det, det, 0x00);
    // Multiply & store
    minor0 = _mm_mul_ps(det, minor0);
    _mm_storel_pi((__m64*)(data), minor0);
    _mm_storeh_pi((__m64*)(data+2), minor0);
    minor1 = _mm_mul_ps(det, minor1);
    _mm_storel_pi((__m64*)(data+4), minor1);
    _mm_storeh_pi((__m64*)(data+6), minor1);
    minor2 = _mm_mul_ps(det, minor2);
    _mm_storel_pi((__m64*)(data+8), minor2);
    _mm_storeh_pi((__m64*)(data+10), minor2);
    minor3 = _mm_mul_ps(det, minor3);
    _mm_storel_pi((__m64*)(data+12), minor3);
    _mm_storeh_pi((__m64*)(data+14), minor3);
}
#pragma warning(default : 4700)
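// Sanity-check sketch (hypothetical test, not part of the original file):
// after InverseSelf(), multiplying the original matrix by the result should
// give roughly the identity, limited by _mm_rcp_ss plus one Newton-Raphson
// step (roughly 22-24 correct bits) rather than a full-precision divide.
//
//   mat4 m = ...;    // any non-singular matrix
//   mat4 inv = m;
//   inv.InverseSelf();
//   // expect every entry of (m * inv) to be within ~1e-5 of the identity,
//   // assuming a mat4*mat4 operator exists elsewhere in the library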