// File: _matrix44_sse.h
// (web-viewer residue removed: "font size" control label)
#ifndef _MATRIX44_SSE_H
#define _MATRIX44_SSE_H
//------------------------------------------------------------------------------
/**
SSE based matrix44 class.
@author
- RadonLabs GmbH
@since
- 2005.7.06
@remarks
- (mis-encoded Korean text, likely "지완 추가", i.e. "added by Jiwan")
*/
#include <xmmintrin.h>
#include "_vector4_sse.h"
#include "_vector3_sse.h"
#include "quaternion.h"
#include "euler.h"
#include "matrixdefs.h"
/// 4x4 identity matrix laid out row after row; used as the copy source
/// whenever a matrix is reset to identity.
static float _matrix44_sse_ident[4 * 4] =
{
    1.0f, 0.0f, 0.0f, 0.0f,     // row 1
    0.0f, 1.0f, 0.0f, 0.0f,     // row 2
    0.0f, 0.0f, 1.0f, 0.0f,     // row 3
    0.0f, 0.0f, 0.0f, 1.0f      // row 4
};
//------------------------------------------------------------------------------
/// 4x4 float matrix whose storage is four SSE registers (m1..m4, one per
/// row) aliased with a plain float m[4][4] array through an anonymous union.
class _matrix44_sse
{
public:
/// default constructor; initializes the matrix to identity
_matrix44_sse();
/// construct from four vectors; v1..v4 are stored as rows m1..m4
_matrix44_sse(const _vector4_sse& v1, const _vector4_sse& v2, const _vector4_sse& v3, const _vector4_sse& v4);
/// copy constructor
_matrix44_sse(const _matrix44_sse& m1);
/// construct from 16 scalars; _m11.._m14 fill row m1, _m21.._m24 row m2, etc.
_matrix44_sse(float _m11, float _m12, float _m13, float _m14,
float _m21, float _m22, float _m23, float _m24,
float _m31, float _m32, float _m33, float _m34,
float _m41, float _m42, float _m43, float _m44);
/// construct a rotation matrix from a quaternion (no translation)
_matrix44_sse(const quaternion& q);
/// construct from four SSE registers, stored as rows m1..m4
_matrix44_sse(const __m128& _m1, const __m128& _m2, const __m128& _m3, const __m128& _m4);
/// convert the rotation part to a quaternion (matrix must not be scaled)
quaternion get_quaternion() const;
/// set rows m1..m4 from four vectors
void set(const _vector4_sse& v1, const _vector4_sse& v2, const _vector4_sse& v3, const _vector4_sse& v4);
/// copy all four rows from another matrix
void set(const _matrix44_sse& m1);
/// set from 16 scalars; _m11.._m14 fill row m1, _m21.._m24 row m2, etc.
void set(float _m11, float _m12, float _m13, float _m14,
float _m21, float _m22, float _m23, float _m24,
float _m31, float _m32, float _m33, float _m34,
float _m41, float _m42, float _m43, float _m44);
/// set to the rotation matrix of a quaternion (no translation)
void set(const quaternion& q);
/// set to identity
void ident();
/// transpose in place
void transpose();
/// determinant of the full 4x4 matrix
float det();
/// full invert
void invert(void);
/// quick invert (if 3x3 rotation and translation)
void invert_simple(void);
/// quick multiplication, assumes that M14==M24==M34==0 and M44==1
void mult_simple(const _matrix44_sse& m1);
/// transform vector3, projecting back into w=1
_vector3_sse transform_coord(const _vector3_sse& v) const;
/// return x component
_vector3_sse x_component() const;
/// return y component
_vector3_sse y_component() const;
/// return z component
_vector3_sse z_component() const;
/// return translate component
_vector3_sse pos_component() const;
/// rotate around global x
void rotate_x(const float a);
/// rotate around global y
void rotate_y(const float a);
/// rotate around global z
void rotate_z(const float a);
/// rotate about any axis
void rotate(const _vector3_sse& vec, float a);
/// translate
void translate(const _vector3_sse& t);
/// set absolute translation
void set_translation(const _vector3_sse& t);
/// scale
void scale(const _vector3_sse& s);
/// unrestricted lookat
void lookat(const _vector3_sse& to, const _vector3_sse& up);
/// restricted lookat
void billboard(const _vector3_sse& to, const _vector3_sse& up);
/// in-place matrix multiply
void operator *= (const _matrix44_sse& m1);
/// multiply source vector into target vector, eliminates tmp vector
void mult(const _vector4_sse& src, _vector4_sse& dst) const;
/// multiply source vector into target vector, eliminates tmp vector
void mult(const _vector3_sse& src, _vector3_sse& dst) const;
/// matrix storage: the SSE view (m1..m4) and the scalar view (m[row][col])
/// share the same 64 bytes.
/// NOTE(review): anonymous structs inside a union are a compiler extension,
/// not standard C++ - confirm all target compilers accept this.
union
{
struct
{
__m128 m1;  ///< row 1, aliases m[0][0..3]
__m128 m2;  ///< row 2, aliases m[1][0..3]
__m128 m3;  ///< row 3, aliases m[2][0..3]
__m128 m4;  ///< row 4, aliases m[3][0..3]
};
struct
{
float m[4][4];  ///< scalar access, indexed [row][column]
};
};
};
//------------------------------------------------------------------------------
/**
    Default constructor: initializes the matrix to identity by loading
    each row from the _matrix44_sse_ident table.
*/
inline
_matrix44_sse::_matrix44_sse()
{
    // unaligned loads: the table is a plain float array, so no 16-byte
    // alignment is assumed
    m1 = _mm_loadu_ps(_matrix44_sse_ident + 0);
    m2 = _mm_loadu_ps(_matrix44_sse_ident + 4);
    m3 = _mm_loadu_ps(_matrix44_sse_ident + 8);
    m4 = _mm_loadu_ps(_matrix44_sse_ident + 12);
}
//------------------------------------------------------------------------------
/**
    Construct from four vectors; the SSE register of v1..v4 becomes
    row m1..m4 respectively.
*/
inline
_matrix44_sse::_matrix44_sse(const _vector4_sse& v1, const _vector4_sse& v2, const _vector4_sse& v3, const _vector4_sse& v4)
{
    m1 = v1.m128;
    m2 = v2.m128;
    m3 = v3.m128;
    m4 = v4.m128;
}
//------------------------------------------------------------------------------
/**
    Copy constructor: duplicates all four rows of mx.
*/
inline
_matrix44_sse::_matrix44_sse(const _matrix44_sse& mx)
{
    m1 = mx.m1;
    m2 = mx.m2;
    m3 = mx.m3;
    m4 = mx.m4;
}
//------------------------------------------------------------------------------
/**
    Construct from 16 scalars. Each group of four arguments fills one row,
    with the first argument of the group landing in the lowest SSE lane
    (i.e. m[row][0]).
*/
inline
_matrix44_sse::_matrix44_sse(float _m11, float _m12, float _m13, float _m14,
                             float _m21, float _m22, float _m23, float _m24,
                             float _m31, float _m32, float _m33, float _m34,
                             float _m41, float _m42, float _m43, float _m44)
{
    // _mm_setr_ps takes its arguments lowest lane first
    m1 = _mm_setr_ps(_m11, _m12, _m13, _m14);
    m2 = _mm_setr_ps(_m21, _m22, _m23, _m24);
    m3 = _mm_setr_ps(_m31, _m32, _m33, _m34);
    m4 = _mm_setr_ps(_m41, _m42, _m43, _m44);
}
//------------------------------------------------------------------------------
/**
    Construct a rotation matrix from a quaternion. The conversion itself
    lives in set(const quaternion&); this constructor just forwards to it.

    FIXME: SSE OPTIMIZATION MISSING!
*/
inline
_matrix44_sse::_matrix44_sse(const quaternion& q)
{
    // set() writes all 16 elements, so no prior initialization is needed
    set(q);
}
//------------------------------------------------------------------------------
/**
    Construct directly from four SSE registers, stored as rows m1..m4.
*/
inline
_matrix44_sse::_matrix44_sse(const __m128& _m1, const __m128& _m2, const __m128& _m3, const __m128& _m4)
{
    m1 = _m1;
    m2 = _m2;
    m3 = _m3;
    m4 = _m4;
}
//------------------------------------------------------------------------------
/**
    Convert the orientation stored in the upper-left 3x3 part of the
    matrix into a quaternion. The matrix must not be scaled!

    FIXME: SSE OPTIMIZATION MISSING!

    @return quaternion built as (x, y, z, w)
*/
inline
quaternion
_matrix44_sse::get_quaternion() const
{
    const float trace = m[0][0] + m[1][1] + m[2][2];
    if (trace > 0.0f)
    {
        // positive trace: derive w from the trace, then x/y/z from the
        // differences of the off-diagonal elements
        const float root = n_sqrt (trace + 1.0f);
        const float scale = 0.5f / root;
        quaternion fromTrace((m[1][2] - m[2][1]) * scale,
                             (m[2][0] - m[0][2]) * scale,
                             (m[0][1] - m[1][0]) * scale,
                             root * 0.5f);
        return fromTrace;
    }

    // otherwise start from the largest diagonal element
    int major = 0;
    if (m[1][1] > m[0][0]) major = 1;
    if (m[2][2] > m[major][major]) major = 2;

    // cyclic successors of the major axis: 0 -> 1 -> 2 -> 0
    const int second = (major + 1) % 3;
    const int third = (second + 1) % 3;

    const float root = n_sqrt((m[major][major] - (m[second][second] + m[third][third])) + 1.0f);
    const float scale = 0.5f / root;

    float comp[4];
    comp[major] = root * 0.5f;
    comp[3] = (m[second][third] - m[third][second])* scale;
    comp[second] = (m[major][second] + m[second][major]) * scale;
    comp[third] = (m[major][third] + m[third][major]) * scale;

    quaternion fromDiagonal(comp[0], comp[1], comp[2], comp[3]);
    return fromDiagonal;
}
//------------------------------------------------------------------------------
/**
*/
inline
void
_matrix44_sse::set(const _vector4_sse& v1, const _vector4_sse& v2, const _vector4_sse& v3, const _vector4_sse& v4)
{
m1 = v1.m128;
m2 = v2.m128;
m3 = v3.m128;
m4 = v4.m128;
}
//------------------------------------------------------------------------------
/**
*/
inline
void
_matrix44_sse::set(const _matrix44_sse& mx)
{
m1 = mx.m1;
m2 = mx.m2;
m3 = mx.m3;
m4 = mx.m4;
}
//------------------------------------------------------------------------------
/**
    Overwrite the matrix with 16 scalars. Each group of four arguments
    fills one row, with the first argument of the group landing in the
    lowest SSE lane (i.e. m[row][0]).
*/
inline
void
_matrix44_sse::set(float _m11, float _m12, float _m13, float _m14,
                   float _m21, float _m22, float _m23, float _m24,
                   float _m31, float _m32, float _m33, float _m34,
                   float _m41, float _m42, float _m43, float _m44)
{
    // _mm_setr_ps takes its arguments lowest lane first
    m1 = _mm_setr_ps(_m11, _m12, _m13, _m14);
    m2 = _mm_setr_ps(_m21, _m22, _m23, _m24);
    m3 = _mm_setr_ps(_m31, _m32, _m33, _m34);
    m4 = _mm_setr_ps(_m41, _m42, _m43, _m44);
}
//------------------------------------------------------------------------------
/**
    Overwrite the matrix with the rotation described by quaternion q.
    The 3x3 part receives the rotation; the fourth row and fourth column
    are set to (0, 0, 0, 1), so the result carries no translation.

    FIXME: SSE OPTIMIZATION MISSING!
*/
inline
void
_matrix44_sse::set(const quaternion& q)
{
    // doubled components, shared by all the products below
    const float x2 = q.x + q.x;
    const float y2 = q.y + q.y;
    const float z2 = q.z + q.z;

    // pairwise products (each already includes the factor 2)
    const float xx = q.x * x2;
    const float xy = q.x * y2;
    const float xz = q.x * z2;
    const float yy = q.y * y2;
    const float yz = q.y * z2;
    const float zz = q.z * z2;
    const float wx = q.w * x2;
    const float wy = q.w * y2;
    const float wz = q.w * z2;

    // row 1
    m[0][0] = 1.0f - (yy + zz);
    m[0][1] = xy + wz;
    m[0][2] = xz - wy;
    m[0][3] = 0.0f;

    // row 2
    m[1][0] = xy - wz;
    m[1][1] = 1.0f - (xx + zz);
    m[1][2] = yz + wx;
    m[1][3] = 0.0f;

    // row 3
    m[2][0] = xz + wy;
    m[2][1] = yz - wx;
    m[2][2] = 1.0f - (xx + yy);
    m[2][3] = 0.0f;

    // row 4
    m[3][0] = 0.0f;
    m[3][1] = 0.0f;
    m[3][2] = 0.0f;
    m[3][3] = 1.0f;
}
//------------------------------------------------------------------------------
/**
*/
inline
void
_matrix44_sse::ident()
{
memcpy(&(m[0][0]), _matrix44_sse_ident, sizeof(_matrix44_sse_ident));
}
//------------------------------------------------------------------------------
/**
    Transpose the matrix in place (rows become columns).
    Pure lane shuffling; no arithmetic is performed on the elements.
*/
inline
void
_matrix44_sse::transpose()
{
    // with rows a=m1, b=m2, c=m3, d=m4: interleave pairs of rows first
    const __m128 ab_lo = _mm_unpacklo_ps(m1, m2);   // a0 b0 a1 b1
    const __m128 cd_lo = _mm_unpacklo_ps(m3, m4);   // c0 d0 c1 d1
    const __m128 ab_hi = _mm_unpackhi_ps(m1, m2);   // a2 b2 a3 b3
    const __m128 cd_hi = _mm_unpackhi_ps(m3, m4);   // c2 d2 c3 d3

    // then glue matching halves together to form the columns
    m1 = _mm_movelh_ps(ab_lo, cd_lo);               // a0 b0 c0 d0
    m2 = _mm_movehl_ps(cd_lo, ab_lo);               // a1 b1 c1 d1
    m3 = _mm_movelh_ps(ab_hi, cd_hi);               // a2 b2 c2 d2
    m4 = _mm_movehl_ps(cd_hi, ab_hi);               // a3 b3 c3 d3
}
//------------------------------------------------------------------------------
/**
    Compute the determinant of the full 4x4 matrix by pairing each 2x2
    minor of rows 1-2 with the complementary 2x2 minor of rows 3-4.
    M11..M44 are taken from matrixdefs.h (presumably element accessors
    for this class; their definition is not visible here).

    FIXME: OPTIMIZE FOR SSE!

    @return the determinant
*/
inline
float
_matrix44_sse::det()
{
    // 2x2 minors of rows 1 and 2, named by the column pair they use
    const float top12 = M11 * M22 - M12 * M21;
    const float top13 = M11 * M23 - M13 * M21;
    const float top14 = M11 * M24 - M14 * M21;
    const float top23 = M12 * M23 - M13 * M22;
    const float top24 = M12 * M24 - M14 * M22;
    const float top34 = M13 * M24 - M14 * M23;

    // 2x2 minors of rows 3 and 4, named by the column pair they use
    const float bot34 = M33 * M44 - M34 * M43;
    const float bot24 = M32 * M44 - M34 * M42;
    const float bot23 = M32 * M43 - M33 * M42;
    const float bot14 = M31 * M44 - M34 * M41;
    const float bot13 = M31 * M43 - M33 * M41;
    const float bot12 = M31 * M42 - M32 * M41;

    // each top minor is multiplied by the bottom minor of the remaining columns
    return
        top12 * bot34
      - top13 * bot24
      + top14 * bot23
      + top23 * bot14
      - top24 * bot13
      + top34 * bot12;
}
//------------------------------------------------------------------------------
/**
Code taken from Intel pdf "Streaming SIMD Extension - Inverse of 4x4 Matrix"
*/
inline
void
_matrix44_sse::invert()
{
float* src = &(m[0][0]);
__m128 minor0, minor1, minor2, minor3;
__m128 row0, row1, row2, row3;
__m128 det, tmp1;
tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
tmp1 = _mm_mul_ps(row2, row3);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
minor0 = _mm_mul_ps(row1, tmp1);
minor1 = _mm_mul_ps(row0, tmp1);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
tmp1 = _mm_mul_ps(row1, row2);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
minor3 = _mm_mul_ps(row0, tmp1);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
row2 = _mm_shuffle_ps(row2, row2, 0x4E);
minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
minor2 = _mm_mul_ps(row0, tmp1);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
tmp1 = _mm_mul_ps(row0, row1);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
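// NOTE: this copy of the file appears to be truncated here; the rest of
// invert() and the remainder of the header are not present.
// The following was web-viewer residue (keyboard shortcut help), kept
// only as a comment:
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full screen:        F11
//   Toggle theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -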