📄 ivp_great_matrix.cxx
字号:
// Copyright (C) Ipion Software GmbH 1999-2000. All rights reserved.
#ifdef IVP_WILLAMETTE
#include <emmintrin.h>
#endif
// IVP_EXPORT_PRIVATE
#include <ivp_physics.hxx>
#if defined(LINUX) || defined(SUN) || (__MWERKS__ && __POWERPC__)
# include <alloca.h>
#endif
#include <ivp_great_matrix.hxx>
#include <ivp_debug_manager.hxx>
#include <ivu_memory.hxx>
#define IVP_UNILATERAL_SOLVER_MAX_STEP 250
#define IVP_STATIC_FRICTION_TEST_EPS 0.00001f //is a percentage value
#define COMPLEX_EPS 10E-6f
#define IVP_NUMERIC_TEST_EVERY_N 7
inline void IVP_VecFPU::fpu_add_multiple_row(IVP_DOUBLE *target_adress,IVP_DOUBLE *source_adress,IVP_DOUBLE factor,int size,IVP_BOOL adress_aligned) {
if(adress_aligned==IVP_FALSE) {
//we have to calculate the block size and shift adresses to lower aligned adresses
long result_adress = long(source_adress) & IVP_VECFPU_MEM_MASK;
target_adress = (IVP_DOUBLE *)( long (target_adress) & IVP_VECFPU_MEM_MASK);
size += (long(source_adress)-result_adress)>>IVP_VECFPU_MEMSHIFT;
source_adress=(IVP_DOUBLE *)result_adress;
}
#if defined(IVP_USE_PS2_VU0)
asm __volatile__("
mfc1 $8,%3
qmtc2 $8,vf5 # vf5.x factor
1:
lqc2 vf4,0x0(%1) # vf4 *source
vmulx.xyzw vf6,vf4,vf5 # vf6 *source * factor
lqc2 vf4,0x0(%0)
addi %0, 0x10
vadd.xyzw vf6,vf4,vf6 # vf6 = *dest + factor * *source
addi %2,-4
addi %1, 0x10
sqc2 vf6,-0x10(%0)
bgtz %2,1b
nop
"
: /* no output */
: "r" (target_adress), "r" (source_adress) , "r" (size), "f" (factor)
: "$8" , "memory");
#else
IVP_IF_WILLAMETTE_OPT(IVP_Environment_Manager::get_environment_manager()->ivp_willamette_optimization) {
# if defined(IVP_WILLAMETTE)
;
__m128d factor128=_mm_set1_pd(factor);
for(int i=size;i>0;i-=IVP_VECFPU_SIZE) {
__m128d source_d=_mm_load_pd(source_adress);
__m128d prod_d=_mm_mul_pd(factor128,source_d);
__m128d target_d=_mm_load_pd(target_adress);
target_d=_mm_add_pd(prod_d,target_d);
_mm_store_pd(target_adress,target_d);
target_adress+=IVP_VECFPU_SIZE;
source_adress+=IVP_VECFPU_SIZE;
}
#endif
} else {
for(int i=size;i>0;i-=IVP_VECFPU_SIZE) {
# if IVP_VECFPU_SIZE == 2
IVP_DOUBLE a = source_adress[0] * factor;
IVP_DOUBLE b = source_adress[1] * factor;
a += target_adress[0];
b += target_adress[1];
target_adress[0] = a;
target_adress[1] = b;
# elif IVP_VECFPU_SIZE == 4
IVP_DOUBLE a = source_adress[0] * factor;
IVP_DOUBLE b = source_adress[1] * factor;
IVP_DOUBLE c = source_adress[2] * factor;
IVP_DOUBLE d = source_adress[3] * factor;
a += target_adress[0];
b += target_adress[1];
c += target_adress[2];
d += target_adress[3];
target_adress[0] = a;
target_adress[1] = b;
target_adress[2] = c;
target_adress[3] = d;
# else
IVP_DOUBLE a = source_adress[0] * factor;
a += target_adress[0];
target_adress[0] = a;
# endif
target_adress+=IVP_VECFPU_SIZE;
source_adress+=IVP_VECFPU_SIZE;
}
}
#endif
}
inline IVP_DOUBLE IVP_VecFPU::fpu_large_dot_product(IVP_DOUBLE *base_a, IVP_DOUBLE *base_b, int size, IVP_BOOL adress_aligned) {
if(adress_aligned==IVP_FALSE) {
//we have to calculate the block size and shift adresses to lower aligned adresses
long result_adress = long(base_a) & IVP_VECFPU_MEM_MASK;
base_b = (IVP_DOUBLE *)( long (base_b) & IVP_VECFPU_MEM_MASK);
size += (long(base_a)-result_adress)>>IVP_VECFPU_MEMSHIFT; // because start changed
base_a=(IVP_DOUBLE *)result_adress;
}
# if defined(IVP_WILLAMETTE)
IVP_IF_WILLAMETTE_OPT(IVP_Environment_Manager::get_environment_manager()->ivp_willamette_optimization) {
__m128d sum128 =_mm_set1_pd(0.0f);
int i;
for(i=size;i>=IVP_VECFPU_SIZE; i-= IVP_VECFPU_SIZE) {
__m128d mult1=_mm_load_pd(base_a);
__m128d mult2=_mm_load_pd(base_b);
__m128d prod =_mm_mul_pd(mult1,mult2);
sum128 =_mm_add_pd(prod,sum128);
base_a += IVP_VECFPU_SIZE;
base_b += IVP_VECFPU_SIZE;
}
__m128d dummy,low,high;
low=sum128;
dummy=sum128;
//_mm_shuffle_pd(sum128,sum128,1); //swap high and low
sum128=_mm_unpackhi_pd(sum128,dummy);
high=sum128;
__m128d sum64=_mm_add_sd(low,high);
__m128d res=sum64;
//_mm_shuffle_pd(sum64,sum64,1);
IVP_DOUBLE result_sum;
_mm_store_sd(&result_sum,res);
for(;i>=0;i--) {
result_sum += base_a[i] * base_b[i];
}
return result_sum;
} else {
IVP_DOUBLE sum=0.0f;
for(int i=size-1;i>=0;i--) {
sum += base_a[i] * base_b[i];
}
return sum;
}
#else
IVP_DOUBLE sum=0.0f;
for(int i=size-1;i>=0;i--) {
sum += base_a[i] * base_b[i];
}
return sum;
#endif
}
inline void IVP_VecFPU::fpu_multiply_row(IVP_DOUBLE *target_adress,IVP_DOUBLE factor,int size,IVP_BOOL adress_aligned) {
if(adress_aligned==IVP_FALSE) {
//we have to calculate the block size and shift adresses to lower aligned adresses
long adress,result_adress;
adress=(long)target_adress;
result_adress=adress & IVP_VECFPU_MEM_MASK;
size+=(adress-result_adress)>>IVP_VECFPU_MEMSHIFT;
target_adress=(IVP_DOUBLE *)result_adress;
}
IVP_IF_WILLAMETTE_OPT(IVP_Environment_Manager::get_environment_manager()->ivp_willamette_optimization) {
#ifdef IVP_WILLAMETTE
__m128d factor128=_mm_set1_pd(factor);
int i;
for(i=size;i>0;i-=IVP_VECFPU_SIZE) {
__m128d target_d=_mm_load_pd(target_adress);
target_d=_mm_mul_pd(factor128,target_d);
_mm_store_pd(target_adress,target_d);
target_adress+=IVP_VECFPU_SIZE;
}
#endif
} else {
int i;
for(i=size;i>0;i-=IVP_VECFPU_SIZE) {
# if IVP_VECFPU_SIZE == 2
IVP_DOUBLE a = target_adress[0] * factor;
IVP_DOUBLE b = target_adress[1] * factor;
target_adress[0] = a;
target_adress[1] = b;
# elif IVP_VECFPU_SIZE == 4
IVP_DOUBLE a = target_adress[0] * factor;
IVP_DOUBLE b = target_adress[1] * factor;
IVP_DOUBLE c = target_adress[2] * factor;
IVP_DOUBLE d = target_adress[3] * factor;
target_adress[0] = a;
target_adress[1] = b;
target_adress[2] = c;
target_adress[3] = d;
# else
IVP_DOUBLE a = target_adress[0] * factor;
target_adress[0] = a;
# endif
target_adress+=IVP_VECFPU_SIZE;
}
}
}
// #+# sparc says rui, optimize for non vector units ( IVP_VECFPU_SIZE = 4 )
inline void IVP_VecFPU::fpu_exchange_rows(IVP_DOUBLE *target_adress1,IVP_DOUBLE *target_adress2,int size,IVP_BOOL adress_aligned) {
if(adress_aligned==IVP_FALSE) {
//we have to calculate the block size and shift adresses to lower aligned adresses
long adress,result_adress;
adress=(long)target_adress1;
result_adress=adress & IVP_VECFPU_MEM_MASK;
size+=(adress-result_adress)>>IVP_VECFPU_MEMSHIFT;
target_adress1=(IVP_DOUBLE *)result_adress;
adress=(long)target_adress2;
adress=adress & IVP_VECFPU_MEM_MASK;
target_adress2=(IVP_DOUBLE *)adress;
}
IVP_IF_WILLAMETTE_OPT(IVP_Environment_Manager::get_environment_manager()->ivp_willamette_optimization) {
#ifdef IVP_WILLAMETTE
int i;
for(i=size;i>0;i-=IVP_VECFPU_SIZE) {
__m128d a_d=_mm_load_pd(target_adress1);
__m128d b_d=_mm_load_pd(target_adress2);
_mm_store_pd(target_adress1,b_d);
_mm_store_pd(target_adress2,a_d);
target_adress1+=IVP_VECFPU_SIZE;
target_adress2+=IVP_VECFPU_SIZE;
}
#endif
} else {
int i;
for(i=size;i>0;i-=IVP_VECFPU_SIZE) {
IVP_DOUBLE h;
# if IVP_VECFPU_SIZE == 2
h=target_adress1[0]; target_adress1[0]=target_adress2[0]; target_adress2[0]=h;
h=target_adress1[1]; target_adress1[1]=target_adress2[1]; target_adress2[1]=h;
# elif IVP_VECFPU_SIZE == 4
h=target_adress1[0]; target_adress1[0]=target_adress2[0]; target_adress2[0]=h;
h=target_adress1[1]; target_adress1[1]=target_adress2[1]; target_adress2[1]=h;
h=target_adress1[2]; target_adress1[2]=target_adress2[2]; target_adress2[2]=h;
h=target_adress1[3]; target_adress1[3]=target_adress2[3]; target_adress2[3]=h;
# else
h=target_adress1[0]; target_adress1[0]=target_adress2[0]; target_adress2[0]=h;
# endif
target_adress1+=IVP_VECFPU_SIZE;
target_adress2+=IVP_VECFPU_SIZE;
}
}
}
inline void IVP_VecFPU::fpu_copy_rows(IVP_DOUBLE *target_adress,IVP_DOUBLE *source_adress,int size,IVP_BOOL adress_aligned) {
if(adress_aligned==IVP_FALSE) {
//we have to calculate the block size and shift adresses to lower aligned adresses
long adress,result_adress;
adress=(long)source_adress;
result_adress=adress & IVP_VECFPU_MEM_MASK;
size+=(adress-result_adress)>>IVP_VECFPU_MEMSHIFT;
source_adress=(IVP_DOUBLE *)result_adress;
adress=(long)target_adress;
adress=adress & IVP_VECFPU_MEM_MASK;
target_adress=(IVP_DOUBLE *)adress;
}
IVP_IF_WILLAMETTE_OPT(IVP_Environment_Manager::get_environment_manager()->ivp_willamette_optimization) {
#ifdef IVP_WILLAMETTE
int i;
for(i=size-1;i>=0;i-=IVP_VECFPU_SIZE) {
__m128d target_d=_mm_load_pd(source_adress);
_mm_store_pd(target_adress,target_d);
target_adress+=IVP_VECFPU_SIZE;
source_adress+=IVP_VECFPU_SIZE;
}
#endif
} else {
int i;
for(i=size-1;i>=0;i-=IVP_VECFPU_SIZE) {
int j;
for(j=0;j<IVP_VECFPU_SIZE;j++) {
target_adress[j] = source_adress[j];
}
target_adress+=IVP_VECFPU_SIZE;
source_adress+=IVP_VECFPU_SIZE;
}
}
}
inline void IVP_VecFPU::fpu_set_row_to_zero(IVP_DOUBLE *target_adress,int size,IVP_BOOL adress_aligned) {
if(adress_aligned==IVP_FALSE) {
long adress,result_adress;
adress=(long)target_adress;
result_adress=adress & IVP_VECFPU_MEM_MASK;
size+=(adress-result_adress)>>IVP_VECFPU_MEMSHIFT;
target_adress=(IVP_DOUBLE *)result_adress;
}
IVP_IF_WILLAMETTE_OPT(IVP_Environment_Manager::get_environment_manager()->ivp_willamette_optimization) {
#ifdef IVP_WILLAMETTE
__m128d zero128=_mm_set1_pd(0.0f);
int i;
for(i=size-1;i>=0;i-=IVP_VECFPU_SIZE) {
_mm_store_pd(target_adress,zero128);
target_adress+=IVP_VECFPU_SIZE;
}
#endif
} else {
int i;
for(i=size-1;i>=0;i-=IVP_VECFPU_SIZE) {
int j;
for(j=0;j<IVP_VECFPU_SIZE;j++) {
target_adress[j] = 0.0f;
}
target_adress+=IVP_VECFPU_SIZE;
}
}
}
// tests a line to be greater equal result value
IVP_RETURN_TYPE IVP_Great_Matrix_Many_Zero::matrix_check_unequation_line(int linenum)
{
IVP_DOUBLE *dline = &matrix_values[aligned_row_len*linenum];
IVP_DOUBLE left_side=0.0f;
for(int i=0;i<columns;i++)
{
left_side+=result_vector[i]* *dline++;
}
IVP_DOUBLE eps = IVP_Inline_Math::fabsd(desired_vector[linenum]*IVP_STATIC_FRICTION_TEST_EPS);
if(left_side+eps >= desired_vector[linenum])
{
return IVP_OK;
} else {
//printf("gauss_unequ %f %f\n",left_side,desired_vector[linenum]);
return IVP_FAULT;
}
}
void IVP_Great_Matrix_Many_Zero::matrix_test_unequation()
{
for(int i=0;i<columns;i++)
{
IVP_DOUBLE left=0.0f;
for(int j=0;j<columns;j++){
left+=result_vector[j]*matrix_values[i*aligned_row_len+j];
}
IVP_IF(1) { printf("unequation_test left %f right %f\n",left,desired_vector[i]); }
}
}
void IVP_Great_Matrix_Many_Zero::matrix_out_before_gauss()
{
IVP_IF(1) {
for(int i=0;i<columns;i++)
{
for(int j=0;j<columns;j++)
{
printf("%.5f ",matrix_values[i*aligned_row_len+j]);
}
printf("= %.5f\n",desired_vector[i]);
}
}
}
// add factor * line number j to line number i (j=first i=second)
// the matrix-entries left of j in both lines are zero
void IVP_Great_Matrix_Many_Zero::add_multiple_line(int first,int second,IVP_DOUBLE factor)
{
IVP_DOUBLE *first_line=&matrix_values[first * this->aligned_row_len + first];
IVP_DOUBLE *second_line=&matrix_values[second * this->aligned_row_len + first];
IVP_VecFPU::fpu_add_multiple_row(second_line,first_line,factor,columns-first,IVP_FALSE);
desired_vector[second]+=desired_vector[first] * factor;
}
void IVP_Great_Matrix_Many_Zero::debug_fill_zero() {
//prevent SUN from getting read from uninitialized errors
for(int i=0;i<columns;i++) {
for(int j=0;j<aligned_row_len;j++) {
matrix_values[i*aligned_row_len+j]=0.0f;
}
}
}
IVP_RETURN_TYPE IVP_Great_Matrix_Many_Zero::solve_great_matrix_many_zero()
{
#ifdef DEBUG
IVP_IF(0) {
this->plausible_input_check();
}
#endif
transform_to_lower_null_triangle();
IVP_RETURN_TYPE rt=solve_lower_null_matrix();
return rt;
}
void IVP_Great_Matrix_Many_Zero::copy_matrix(IVP_Great_Matrix_Many_Zero *orig_mat) {
for(int i=0;i<columns;i++) {
for(int j=0;j<columns;j++) {
this->matrix_values[i*columns+j] = orig_mat->matrix_values[i*columns+j];
}
this->desired_vector[i] = orig_mat->desired_vector[i];
}
}
void IVP_Great_Matrix_Many_Zero::align_matrix_values() {
long adress,result_adress;
adress=(long)matrix_values;
int maximal_shift=(IVP_VECFPU_SIZE-1)<<IVP_VECFPU_MEMSHIFT;
result_adress=(adress+maximal_shift) & IVP_VECFPU_MEM_MASK;
matrix_values=(IVP_DOUBLE*)result_adress;
IVP_IF(1){
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -