📄 _matrix44_sse.h
字号:
minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
tmp1 = _mm_mul_ps(row0, row3);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
tmp1 = _mm_mul_ps(row0, row2);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
det = _mm_mul_ps(row0, minor0);
det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
tmp1 = _mm_rcp_ss(det);
det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
det = _mm_shuffle_ps(det, det, 0x00);
minor0 = _mm_mul_ps(det, minor0);
_mm_storel_pi((__m64*)(src), minor0);
_mm_storeh_pi((__m64*)(src+2), minor0);
minor1 = _mm_mul_ps(det, minor1);
_mm_storel_pi((__m64*)(src+4), minor1);
_mm_storeh_pi((__m64*)(src+6), minor1);
minor2 = _mm_mul_ps(det, minor2);
_mm_storel_pi((__m64*)(src+ 8), minor2);
_mm_storeh_pi((__m64*)(src+10), minor2);
minor3 = _mm_mul_ps(det, minor3);
_mm_storel_pi((__m64*)(src+12), minor3);
_mm_storeh_pi((__m64*)(src+14), minor3);
}
//------------------------------------------------------------------------------
/**
    Inverts a 4x4 matrix made of a 3x3 rotation block plus a translation
    (i.e. everything that has [0,0,0,1] as the rightmost column). MUCH
    cheaper than a general 4x4 inversion.

    If det() returns exactly zero the matrix is left unchanged.

    FIXME: SSE OPTIMIZATION!
*/
inline
void
_matrix44_sse::invert_simple()
{
    const float determinant = det();
    if (determinant == 0.0f)
    {
        // singular matrix: there is no inverse, keep the current contents
        return;
    }
    const float k = 1.0f/determinant;

    // upper-left 3x3 block, each entry scaled by 1/det
    const float inv11 = k * ((M22 * M33) - (M23 * M32));
    const float inv12 = k * ((M32 * M13) - (M33 * M12));
    const float inv13 = k * ((M12 * M23) - (M13 * M22));

    const float inv21 = k * ((M23 * M31) - (M21 * M33));
    const float inv22 = k * ((M33 * M11) - (M31 * M13));
    const float inv23 = k * ((M13 * M21) - (M11 * M23));

    const float inv31 = k * ((M21 * M32) - (M22 * M31));
    const float inv32 = k * ((M31 * M12) - (M32 * M11));
    const float inv33 = k * ((M11 * M22) - (M12 * M21));

    // fourth row (translation part of the inverse)
    const float inv41 = k * (M21*(M33*M42 - M32*M43) + M22*(M31*M43 - M33*M41) + M23*(M32*M41 - M31*M42));
    const float inv42 = k * (M31*(M13*M42 - M12*M43) + M32*(M11*M43 - M13*M41) + M33*(M12*M41 - M11*M42));
    const float inv43 = k * (M41*(M13*M22 - M12*M23) + M42*(M11*M23 - M13*M21) + M43*(M12*M21 - M11*M22));

    // every value above was read from the old matrix before it is overwritten
    this->set(
        inv11, inv12, inv13, 0.0f,
        inv21, inv22, inv23, 0.0f,
        inv31, inv32, inv33, 0.0f,
        inv41, inv42, inv43, 1.0f);
}
//------------------------------------------------------------------------------
/**
optimized multiplication, assumes that M14==M24==M34==0 AND M44==1
NOTE: On SSE, this is a normal matrix multiplication
Takes 16 muls, 12 adds and 16 shuffles.
*/
inline
void
_matrix44_sse::mult_simple(const _matrix44_sse& mx)
{
m1 = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(m1, m1, _MM_SHUFFLE(0,0,0,0)), mx.m1), _mm_mul_ps(_mm_shuffle_ps(m1, m1, _MM_SHUFFLE(1,1,1,1)), mx.m2)), _mm_mul_ps(_mm_shuffle_ps(m1, m1, _MM_SHUFFLE(2,2,2,2)), mx.m3)), _mm_mul_ps(_mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3,3,3,3)), mx.m4));
m2 = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0,0,0,0)), mx.m1), _mm_mul_ps(_mm_shuffle_ps(m2, m2, _MM_SHUFFLE(1,1,1,1)), mx.m2)), _mm_mul_ps(_mm_shuffle_ps(m2, m2, _MM_SHUFFLE(2,2,2,2)), mx.m3)), _mm_mul_ps(_mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3,3,3,3)), mx.m4));
m3 = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0,0,0,0)), mx.m1), _mm_mul_ps(_mm_shuffle_ps(m3, m3, _MM_SHUFFLE(1,1,1,1)), mx.m2)), _mm_mul_ps(_mm_shuffle_ps(m3, m3, _MM_SHUFFLE(2,2,2,2)), mx.m3)), _mm_mul_ps(_mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3,3,3,3)), mx.m4));
m4 = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(m4, m4, _MM_SHUFFLE(0,0,0,0)), mx.m1), _mm_mul_ps(_mm_shuffle_ps(m4, m4, _MM_SHUFFLE(1,1,1,1)), mx.m2)), _mm_mul_ps(_mm_shuffle_ps(m4, m4, _MM_SHUFFLE(2,2,2,2)), mx.m3)), _mm_mul_ps(_mm_shuffle_ps(m4, m4, _MM_SHUFFLE(3,3,3,3)), mx.m4));
}
//------------------------------------------------------------------------------
/**
    Transforms a vector by the matrix (treating it as (x, y, z, 1)) and
    divides by the resulting w, projecting the result back into w=1.
    There is no guard against the computed w being zero.

    FIXME: SSE OPTIMIZATION!
*/
inline
_vector3_sse
_matrix44_sse::transform_coord(const _vector3_sse& v) const
{
    // homogeneous w of the transformed point, and its reciprocal
    const float w = M14*v.x + M24*v.y + M34*v.z + M44;
    const float inv_w = 1.0f / w;

    const float px = (M11*v.x + M21*v.y + M31*v.z + M41) * inv_w;
    const float py = (M12*v.x + M22*v.y + M32*v.z + M42) * inv_w;
    const float pz = (M13*v.x + M23*v.y + M33*v.z + M43) * inv_w;
    return _vector3_sse(px, py, pz);
}
//------------------------------------------------------------------------------
/**
    Returns the first matrix row (m1) as a vector, built directly from the
    packed register.
*/
inline
_vector3_sse
_matrix44_sse::x_component() const
{
    return _vector3_sse(m1);
}
//------------------------------------------------------------------------------
/**
    Returns the second matrix row (m2) as a vector, built directly from the
    packed register.
*/
inline
_vector3_sse
_matrix44_sse::y_component() const
{
    return _vector3_sse(m2);
}
//------------------------------------------------------------------------------
/**
    Returns the third matrix row (m3) as a vector, built directly from the
    packed register.
*/
inline
_vector3_sse
_matrix44_sse::z_component() const
{
    return _vector3_sse(m3);
}
//------------------------------------------------------------------------------
/**
    Returns the translation part of the matrix, (M41, M42, M43), as a
    vector.
*/
inline
_vector3_sse
_matrix44_sse::pos_component() const
{
    return _vector3_sse(M41, M42, M43);
}
//------------------------------------------------------------------------------
/**
    Rotates the matrix about the x axis by angle a: in every row, the
    elements in columns 1 and 2 are mixed with cos(a)/sin(a); columns 0 and
    3 are not touched.

    FIXME: SSE OPTIMIZATION!
*/
inline
void
_matrix44_sse::rotate_x(const float a)
{
    const float cos_a = n_cos(a);
    const float sin_a = n_sin(a);
    for (int row = 0; row < 4; ++row)
    {
        // read both inputs before either one is overwritten
        const float old_y = m[row][1];
        const float old_z = m[row][2];
        m[row][1] = old_y*cos_a + old_z*-sin_a;
        m[row][2] = old_y*sin_a + old_z*cos_a;
    }
}
//------------------------------------------------------------------------------
/**
    Rotates the matrix about the y axis by angle a: in every row, the
    elements in columns 0 and 2 are mixed with cos(a)/sin(a); columns 1 and
    3 are not touched.

    FIXME: SSE OPTIMIZATION!
*/
inline
void
_matrix44_sse::rotate_y(const float a)
{
    const float cos_a = n_cos(a);
    const float sin_a = n_sin(a);
    for (int row = 0; row < 4; ++row)
    {
        // read both inputs before either one is overwritten
        const float old_x = m[row][0];
        const float old_z = m[row][2];
        m[row][0] = old_x*cos_a + old_z*sin_a;
        m[row][2] = old_x*-sin_a + old_z*cos_a;
    }
}
//------------------------------------------------------------------------------
/**
    Rotates the matrix about the z axis by angle a: in every row, the
    elements in columns 0 and 1 are mixed with cos(a)/sin(a); columns 2 and
    3 are not touched.

    FIXME: SSE OPTIMIZATION!
*/
inline
void
_matrix44_sse::rotate_z(const float a)
{
    const float cos_a = n_cos(a);
    const float sin_a = n_sin(a);
    for (int row = 0; row < 4; ++row)
    {
        // read both inputs before either one is overwritten
        const float old_x = m[row][0];
        const float old_y = m[row][1];
        m[row][0] = old_x*cos_a + old_y*-sin_a;
        m[row][1] = old_x*sin_a + old_y*cos_a;
    }
}
//------------------------------------------------------------------------------
/**
    Adds t to the fourth matrix row (m4) with one packed add.

    All four lanes of t.m128 are added, so M44 only stays unchanged when the
    fourth lane of t is zero -- NOTE(review): assumed to hold for
    _vector3_sse, confirm against its definition.
*/
inline
void
_matrix44_sse::translate(const _vector3_sse& t)
{
    const __m128 moved = _mm_add_ps(m4, t.m128);
    m4 = moved;
}
//------------------------------------------------------------------------------
/**
    Replaces the translation part (M41, M42, M43) of the matrix with t.

    M44 is preserved. Copying t.m128 wholesale into m4 would also overwrite
    M44 with whatever sits in the unused fourth lane of the vector, which
    breaks the homogeneous row of an affine matrix.

    @param  t   new translation; only its first three lanes are used
*/
inline
void
_matrix44_sse::set_translation(const _vector3_sse& t)
{
    // z_w = [ t.z, t.z, M44, M44 ]  (low two lanes from t, high two from m4)
    const __m128 z_w = _mm_shuffle_ps(t.m128, m4, _MM_SHUFFLE(3,3,2,2));
    // m4  = [ t.x, t.y, t.z, M44 ]  (low two lanes from t, high two from z_w)
    m4 = _mm_shuffle_ps(t.m128, z_w, _MM_SHUFFLE(2,0,1,0));
}
//------------------------------------------------------------------------------
/**
    Scales the matrix: every row is multiplied lane by lane with
    (s.x, s.y, s.z, 1), so the fourth column is left as it is.
*/
inline
void
_matrix44_sse::scale(const _vector3_sse& s)
{
    // _vector3_sse have the w element set to zero, we need it at 1:
    // adding (0, 0, 0, 1) to the vector produces the per-lane factors.
    const __m128 w_one = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
    const __m128 factors = _mm_add_ps(w_one, s.m128);

    m1 = _mm_mul_ps(m1, factors);
    m2 = _mm_mul_ps(m2, factors);
    m3 = _mm_mul_ps(m3, factors);
    m4 = _mm_mul_ps(m4, factors);
}
//------------------------------------------------------------------------------
/**
*/
inline
void
_matrix44_sse::lookat(const _vector3_sse& to, const _vector3_sse& up)
{
_vector3_sse from(M41, M42, M43);
_vector3_sse z(from - to);
z.norm();
_vector3_sse y(up);
_vector3_sse x(y * z); // x = y cross z
y = z * x; // y = z cross x
x.norm();
y.norm();
m1 = x.m128;
m2 = y.m128;
m3 = z.m128;
}
//------------------------------------------------------------------------------
/**
*/
inline
void
_matrix44_sse::billboard(const _vector3_sse& to, const _vector3_sse& up)
{
_vector3_sse from(M41, M42, M43);
_vector3_sse z(from - to);
z.norm();
_vector3_sse y(up);
_vector3_sse x(y * z);
z = x * y;
x.norm();
y.norm();
z.norm();
m1 = x.m128;
m2 = y.m128;
m3 = z.m128;
}
//------------------------------------------------------------------------------
/**
    In-place multiplication: *this = (*this) * mx.

    Each result row is the sum of the four rows of mx, weighted by the
    broadcast components of the corresponding row of *this. The row
    expressions are kept as single nested intrinsic calls (not very
    readable, but that form was measured to be faster than using
    intermediate temporaries).

    The four rows of mx are read into locals before anything is written.
    Without that, m1 would already hold the new first row when mx.m1 is read
    again for rows 2..4, so "m *= m" would mix old and new rows.

    @param  mx  right-hand matrix, may be *this
*/
inline
void
_matrix44_sse::operator *= (const _matrix44_sse& mx)
{
    // snapshot of the right-hand side (alias safety, see above)
    const __m128 b1 = mx.m1;
    const __m128 b2 = mx.m2;
    const __m128 b3 = mx.m3;
    const __m128 b4 = mx.m4;

    m1 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
            _mm_mul_ps(_mm_shuffle_ps(m1, m1, _MM_SHUFFLE(0,0,0,0)), b1),
            _mm_mul_ps(_mm_shuffle_ps(m1, m1, _MM_SHUFFLE(1,1,1,1)), b2)),
            _mm_mul_ps(_mm_shuffle_ps(m1, m1, _MM_SHUFFLE(2,2,2,2)), b3)),
            _mm_mul_ps(_mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3,3,3,3)), b4));
    m2 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
            _mm_mul_ps(_mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0,0,0,0)), b1),
            _mm_mul_ps(_mm_shuffle_ps(m2, m2, _MM_SHUFFLE(1,1,1,1)), b2)),
            _mm_mul_ps(_mm_shuffle_ps(m2, m2, _MM_SHUFFLE(2,2,2,2)), b3)),
            _mm_mul_ps(_mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3,3,3,3)), b4));
    m3 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
            _mm_mul_ps(_mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0,0,0,0)), b1),
            _mm_mul_ps(_mm_shuffle_ps(m3, m3, _MM_SHUFFLE(1,1,1,1)), b2)),
            _mm_mul_ps(_mm_shuffle_ps(m3, m3, _MM_SHUFFLE(2,2,2,2)), b3)),
            _mm_mul_ps(_mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3,3,3,3)), b4));
    m4 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
            _mm_mul_ps(_mm_shuffle_ps(m4, m4, _MM_SHUFFLE(0,0,0,0)), b1),
            _mm_mul_ps(_mm_shuffle_ps(m4, m4, _MM_SHUFFLE(1,1,1,1)), b2)),
            _mm_mul_ps(_mm_shuffle_ps(m4, m4, _MM_SHUFFLE(2,2,2,2)), b3)),
            _mm_mul_ps(_mm_shuffle_ps(m4, m4, _MM_SHUFFLE(3,3,3,3)), b4));
}
//------------------------------------------------------------------------------
/**
FIXME: SSE OPTIMIZATION!
*/
inline
void
_matrix44_sse::rotate(const _vector3_sse& vec, float a)
{
_vector3_sse v(vec);
v.norm();
float sa = (float) n_sin(a);
float ca = (float) n_cos(a);
_matrix44_sse rotM;
rotM.M11 = ca + (1.0f - ca) * v.x * v.x;
rotM.M12 = (1.0f - ca) * v.x * v.y - sa * v.z;
rotM.M13 = (1.0f - ca) * v.z * v.x + sa * v.y;
rotM.M21 = (1.0f - ca) * v.x * v.y + sa * v.z;
rotM.M22 = ca + (1.0f - ca) * v.y * v.y;
rotM.M23 = (1.0f - ca) * v.y * v.z - sa * v.x;
rotM.M31 = (1.0f - ca) * v.z * v.x - sa * v.y;
rotM.M32 = (1.0f - ca) * v.y * v.z + sa * v.x;
rotM.M33 = ca + (1.0f - ca) * v.z * v.z;
(*this) *= rotM;
}
//------------------------------------------------------------------------------
/**
    Multiply source directly into target vector (without creating a
    temporary vector4 object):

        dst = m1*src.x + m2*src.y + m3*src.z + m4*src.w

    src is read completely before dst is written.
*/
inline
void
_matrix44_sse::mult(const _vector4_sse& src, _vector4_sse& dst) const
{
    // broadcast each source component across a full register
    const __m128 v = src.m128;
    const __m128 vx = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0,0,0,0));
    const __m128 vy = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1,1,1,1));
    const __m128 vz = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2,2,2,2));
    const __m128 vw = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3,3,3,3));

    // accumulate the weighted rows in the order ((x + y) + z) + w
    const __m128 sum_xy = _mm_add_ps(_mm_mul_ps(m1, vx), _mm_mul_ps(m2, vy));
    const __m128 sum_xyz = _mm_add_ps(sum_xy, _mm_mul_ps(m3, vz));
    dst.m128 = _mm_add_ps(sum_xyz, _mm_mul_ps(m4, vw));
}
//------------------------------------------------------------------------------
/**
    Multiply source directly into target vector (without creating a
    temporary vector4 object):

        dst = m1*src.x + m2*src.y + m3*src.z + m4*(1,1,1,0)

    The source is treated as a point with an implicit w of 1, so the
    translation row m4 is added to x/y/z. This matches
    operator*(matrix, vector3) in this file. Weighting m4 with the vector's
    own fourth lane (as the vector4 overload does) would drop the
    translation whenever that unused lane is zero.

    @param  src     vector to transform
    @param  dst     receives the result; may be the same object as src
*/
inline
void
_matrix44_sse::mult(const _vector3_sse& src, _vector3_sse& dst) const
{
    dst.m128 = _mm_add_ps(
        _mm_add_ps(
            _mm_add_ps(
                _mm_mul_ps(m1, _mm_shuffle_ps(src.m128, src.m128, _MM_SHUFFLE(0,0,0,0))),
                _mm_mul_ps(m2, _mm_shuffle_ps(src.m128, src.m128, _MM_SHUFFLE(1,1,1,1)))),
            _mm_mul_ps(m3, _mm_shuffle_ps(src.m128, src.m128, _MM_SHUFFLE(2,2,2,2)))),
        // implicit w == 1 for x/y/z; no contribution of m4 to the fourth lane
        _mm_mul_ps(m4, _mm_set_ps(0.0f, 1.0f, 1.0f, 1.0f)));
}
//------------------------------------------------------------------------------
/**
    Matrix product ma * mb, returned as a new matrix. Each result row is the
    sum of the rows of mb weighted by the broadcast components of the
    corresponding row of ma.
*/
static
inline
_matrix44_sse
operator * (const _matrix44_sse& ma, const _matrix44_sse& mb)
{
    const __m128 row1 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
        _mm_mul_ps(_mm_shuffle_ps(ma.m1, ma.m1, _MM_SHUFFLE(0,0,0,0)), mb.m1),
        _mm_mul_ps(_mm_shuffle_ps(ma.m1, ma.m1, _MM_SHUFFLE(1,1,1,1)), mb.m2)),
        _mm_mul_ps(_mm_shuffle_ps(ma.m1, ma.m1, _MM_SHUFFLE(2,2,2,2)), mb.m3)),
        _mm_mul_ps(_mm_shuffle_ps(ma.m1, ma.m1, _MM_SHUFFLE(3,3,3,3)), mb.m4));

    const __m128 row2 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
        _mm_mul_ps(_mm_shuffle_ps(ma.m2, ma.m2, _MM_SHUFFLE(0,0,0,0)), mb.m1),
        _mm_mul_ps(_mm_shuffle_ps(ma.m2, ma.m2, _MM_SHUFFLE(1,1,1,1)), mb.m2)),
        _mm_mul_ps(_mm_shuffle_ps(ma.m2, ma.m2, _MM_SHUFFLE(2,2,2,2)), mb.m3)),
        _mm_mul_ps(_mm_shuffle_ps(ma.m2, ma.m2, _MM_SHUFFLE(3,3,3,3)), mb.m4));

    const __m128 row3 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
        _mm_mul_ps(_mm_shuffle_ps(ma.m3, ma.m3, _MM_SHUFFLE(0,0,0,0)), mb.m1),
        _mm_mul_ps(_mm_shuffle_ps(ma.m3, ma.m3, _MM_SHUFFLE(1,1,1,1)), mb.m2)),
        _mm_mul_ps(_mm_shuffle_ps(ma.m3, ma.m3, _MM_SHUFFLE(2,2,2,2)), mb.m3)),
        _mm_mul_ps(_mm_shuffle_ps(ma.m3, ma.m3, _MM_SHUFFLE(3,3,3,3)), mb.m4));

    const __m128 row4 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
        _mm_mul_ps(_mm_shuffle_ps(ma.m4, ma.m4, _MM_SHUFFLE(0,0,0,0)), mb.m1),
        _mm_mul_ps(_mm_shuffle_ps(ma.m4, ma.m4, _MM_SHUFFLE(1,1,1,1)), mb.m2)),
        _mm_mul_ps(_mm_shuffle_ps(ma.m4, ma.m4, _MM_SHUFFLE(2,2,2,2)), mb.m3)),
        _mm_mul_ps(_mm_shuffle_ps(ma.m4, ma.m4, _MM_SHUFFLE(3,3,3,3)), mb.m4));

    return _matrix44_sse(row1, row2, row3, row4);
}
//------------------------------------------------------------------------------
/**
    Transforms a 3-component vector by the matrix:

        result = m1*v.x + m2*v.y + m3*v.z + m4*(1,1,1,0)

    i.e. the vector is treated as a point with an implicit w of 1 (the
    translation row is added to x/y/z), while m4 contributes nothing to the
    fourth lane of the result.
*/
static
inline
_vector3_sse operator * (const _matrix44_sse& m, const _vector3_sse& v)
{
    // broadcast each vector component across a full register
    const __m128 vx = _mm_shuffle_ps(v.m128, v.m128, _MM_SHUFFLE(0,0,0,0));
    const __m128 vy = _mm_shuffle_ps(v.m128, v.m128, _MM_SHUFFLE(1,1,1,1));
    const __m128 vz = _mm_shuffle_ps(v.m128, v.m128, _MM_SHUFFLE(2,2,2,2));

    // accumulate in the order ((x + y) + z) + translation
    const __m128 sum_xy = _mm_add_ps(_mm_mul_ps(m.m1, vx), _mm_mul_ps(m.m2, vy));
    const __m128 sum_xyz = _mm_add_ps(sum_xy, _mm_mul_ps(m.m3, vz));
    const __m128 shifted = _mm_mul_ps(m.m4, _mm_set_ps(0.0f, 1.0f, 1.0f, 1.0f));

    return _vector3_sse(_mm_add_ps(sum_xyz, shifted));
}
//------------------------------------------------------------------------------
/**
    Transforms a 4-component vector by the matrix:

        result = m1*v.x + m2*v.y + m3*v.z + m4*v.w
*/
static
inline
_vector4_sse operator * (const _matrix44_sse& m, const _vector4_sse& v)
{
    // broadcast each vector component across a full register
    const __m128 vx = _mm_shuffle_ps(v.m128, v.m128, _MM_SHUFFLE(0,0,0,0));
    const __m128 vy = _mm_shuffle_ps(v.m128, v.m128, _MM_SHUFFLE(1,1,1,1));
    const __m128 vz = _mm_shuffle_ps(v.m128, v.m128, _MM_SHUFFLE(2,2,2,2));
    const __m128 vw = _mm_shuffle_ps(v.m128, v.m128, _MM_SHUFFLE(3,3,3,3));

    // accumulate in the order ((x + y) + z) + w
    const __m128 sum_xy = _mm_add_ps(_mm_mul_ps(m.m1, vx), _mm_mul_ps(m.m2, vy));
    const __m128 sum_xyz = _mm_add_ps(sum_xy, _mm_mul_ps(m.m3, vz));

    return _vector4_sse(_mm_add_ps(sum_xyz, _mm_mul_ps(m.m4, vw)));
}
//------------------------------------------------------------------------------
#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -