📄 denovopartmodel.cpp
字号:
#include "DeNovoRankScore.h"
#include "auxfun.h"
extern const float RKH_pair_matrix[6][6];
void find_prediction_ranks(const vector<float>& scores, vector<int>& ranks)
{
vector<score_pair> pairs;
pairs.resize(scores.size());
ranks.clear();
if (scores.size()==0)
return;
pairs[0].idx=0;
pairs[0].score=NEG_INF;
int i;
for (i=1; i<scores.size(); i++)
{
pairs[i].idx=i;
pairs[i].score = scores[i];
}
sort(pairs.begin(),pairs.end());
ranks.resize(scores.size());
for (i=0; i<pairs.size(); i++)
if (pairs[i].score == NEG_INF)
{
ranks[pairs[i].idx]=POS_INF;
}
else
ranks[pairs[i].idx]=i;
/* cout << setprecision(1) << fixed;
for (i=0; i<scores.size(); i++)
cout << scores[i] << "\t";
cout << endl;
for (i=0; i<scores.size(); i++)
cout << ranks[i] << "\t";
cout << endl << endl;*/
}
struct inten_pair {
inten_pair() : idx(int(NEG_INF)), inten(NEG_INF) {};
inten_pair(int _i, intensity_t _n) : idx(_i), inten(_n) {};
bool operator< (const inten_pair& other) const
{
return inten>other.inten;
}
int idx;
intensity_t inten;
};
void find_inten_ranks(const vector<intensity_t>& intens, vector<int>& ranks)
{
vector<inten_pair> pairs;
pairs.resize(intens.size());
ranks.clear();
if (intens.size()==0)
return;
pairs[0].idx=0;
pairs[0].inten=NEG_INF;
int i;
for (i=1; i<intens.size(); i++)
{
pairs[i].idx=i;
pairs[i].inten = intens[i];
}
sort(pairs.begin(),pairs.end());
ranks.resize(intens.size());
for (i=0; i<intens.size(); i++)
if (pairs[i].inten <= 0)
{
ranks[pairs[i].idx]=POS_INF;
}
else
ranks[pairs[i].idx]=i;
}
struct rank_pair {
bool operator< ( const rank_pair& other) const
{
return org_rank<other.org_rank;
}
int idx;
int org_rank;
};
void convert_to_relative_ranks(const vector<int>& org_ranks, vector<int>& rel_ranks)
{
const int num_ranks = org_ranks.size();
vector<rank_pair> pairs;
pairs.resize(num_ranks);
if (num_ranks==0)
return;
int i;
for (i=0; i<num_ranks; i++)
{
pairs[i].idx=i;
pairs[i].org_rank = org_ranks[i];
}
sort(pairs.begin(),pairs.end());
rel_ranks.resize(num_ranks);
for (i=0; i<num_ranks; i++)
if (pairs[i].org_rank >= 999)
{
rel_ranks[pairs[i].idx]=POS_INF;
}
else
rel_ranks[pairs[i].idx]=i;
}
/***********************************************************************
dimensions of intens are (#all frags X #breakages (== peptide length +1)
************************************************************************/
void DeNovoPartitionModel::fill_combined_peak_prediction_features(
const PeptideSolution& sol,
const vector< vector<intensity_t> >& intens,
const PeakRankModel *peak_model,
RankBoostSample& rbs,
int specific_size) const
{
const int num_ranks_to_consider = 46;
int f_idx = combined_ppp_start_idx;
PeptidePeakPrediction ppp;
peak_model->calc_peptide_predicted_scores(sol, ppp, specific_size);
// the ppp includes a table of rank scores (rows are actual frag idxs, not relative
// position in the frag_type_idxs).
// reduce intensities to the same dimensionality
const int num_frags = ppp.frag_idxs.size();
vector< vector< float> > observed_intens;
observed_intens.resize(num_frags);
int i,f;
for (f=0; f<num_frags; f++)
{
const int frag_idx = ppp.frag_idxs[f];
observed_intens[f]=intens[frag_idx];
}
// calculate the ranks and mapping between predicted and observed
vector< vector<int> > observed_ranks, predicted_ranks;
calc_combined_peak_ranks(observed_intens, observed_ranks);
calc_combined_peak_ranks(ppp.rank_scores, predicted_ranks);
vector<int> pred2obs, obs2pred;
vector<int> num_obs_for_frag, num_pred_for_frag;
vector<float> ordered_scores, // scores sorted according to their value
obs_ordered_scores; // scores sorted according to the observed intensity rank
pred2obs.resize(num_ranks_to_consider,999); // look at top 50 peaks
obs2pred.resize(num_ranks_to_consider,999);
ordered_scores.resize(num_ranks_to_consider,NEG_INF);
obs_ordered_scores.resize(num_ranks_to_consider,NEG_INF);
num_obs_for_frag.resize(num_frags,0);
num_pred_for_frag.resize(num_frags,0);
for (f=0; f<num_frags; f++)
{
if (observed_ranks[f].size() != predicted_ranks[f].size())
{
cout << "#obs frags: " << observed_ranks.size() << endl;
cout << "#pred frags: " << predicted_ranks.size() << endl;
cout << "Error: mismatch in rank dimensionalities!" << endl;
cout << f << "\tobs : " << observed_ranks[f].size() << " pred " << predicted_ranks[f].size() << endl;
exit(1);
}
const int num_ranks = predicted_ranks[f].size();
const vector<float>& frag_rank_scores = ppp.rank_scores[f];
const vector<float>& frag_intens = observed_intens[f];
int i;
for (i=0; i<num_ranks; i++)
{
const int obs_rank = observed_ranks[f][i];
const int pred_rank = predicted_ranks[f][i];
const float pred_score = frag_rank_scores[i];
if (pred_rank<num_ranks_to_consider)
{
pred2obs[pred_rank]=obs_rank;
ordered_scores[pred_rank]=pred_score;
}
if (obs_rank<num_ranks_to_consider)
{
obs2pred[obs_rank]=pred_rank;
obs_ordered_scores[obs_rank]=pred_score;
}
if (frag_intens[i]>0)
num_obs_for_frag[f]++;
if (frag_rank_scores[i]>NEG_INF)
num_pred_for_frag[f]++;
}
}
vector<int> missed_ranks;
for (i=0; i<num_ranks_to_consider; i++)
if (pred2obs[i]>900)
missed_ranks.push_back(i);
const int mobility = sol.calc_mobility();
rbs.add_real_feature(f_idx++,(float)mobility);
// fill per frag features
for (f=0; f<num_frags && f<3; f++)
{
vector<int> rel_obs_ranks, rel_pred_ranks;
convert_to_relative_ranks(observed_ranks[f], rel_obs_ranks);
convert_to_relative_ranks(predicted_ranks[f],rel_pred_ranks);
if (num_pred_for_frag[f]>0)
{
rbs.add_real_feature(f_idx+f,num_obs_for_frag[f]/(float)num_pred_for_frag[f]);
}
}
f_idx+=3;
//sums of missed ranks
rbs.add_real_feature(f_idx++,(float)missed_ranks.size());
int sum_missed=NEG_INF;
if (missed_ranks.size()>4)
{
sum_missed = missed_ranks[0]+missed_ranks[1]+missed_ranks[2]+missed_ranks[3]+missed_ranks[4];
rbs.add_real_feature(f_idx+(mobility-1),sum_missed);
rbs.add_real_feature(f_idx+3,sum_missed);
}
f_idx+=4;
if (missed_ranks.size()>9)
{
sum_missed=missed_ranks[5]+missed_ranks[6]+missed_ranks[7]+missed_ranks[8]+missed_ranks[9];
rbs.add_real_feature(f_idx+(mobility-1),sum_missed);
rbs.add_real_feature(f_idx+3,sum_missed);
}
f_idx+=4;
if (missed_ranks.size()>14)
{
sum_missed=missed_ranks[10]+missed_ranks[11]+missed_ranks[12]+missed_ranks[13]+missed_ranks[14];
rbs.add_real_feature(f_idx+(mobility-1),sum_missed);
rbs.add_real_feature(f_idx+3,sum_missed);
}
f_idx+=4;
for (i=0; i<7; i++)
rbs.add_real_feature(f_idx++,pred2obs[i]);
for (i=0; i<7; i++)
rbs.add_real_feature(f_idx++,obs2pred[i]);
for (i=0; i<9&& i*2<missed_ranks.size(); i++)
rbs.add_real_feature(f_idx+i,missed_ranks[2*i]);
f_idx+=9;
for (i=0; i<7; i++)
if (ordered_scores[i]>NEG_INF && obs_ordered_scores[i]>NEG_INF)
rbs.add_real_feature(f_idx+i,ordered_scores[i]-obs_ordered_scores[i]);
f_idx+=7;
// calc dot prod score feature
// normalize ranks according to f(x)=1/(1+x)
static vector<float> one_over_rank, one_over_rank_sqr;
if (one_over_rank.size()<1000)
{
one_over_rank.resize(1000);
one_over_rank_sqr.resize(1000);
int i;
for (i=0; i<num_ranks_to_consider; i++)
{
one_over_rank[i]=(i/(1.0+(float)i));
one_over_rank_sqr[i]=one_over_rank[i]*one_over_rank[i];
}
}
float top_a=0, top_b=0;
float bottom_a1=1E-8,bottom_a2=1E-8, bottom_b1=1E-8, bottom_b2=1E-8;
int round_idx=0;
for (i=0; i<num_ranks_to_consider && i<45; i++)
{
const int obs_rank=(pred2obs[i]>999 ? 999 : pred2obs[i]);
const int pred_rank = (obs2pred[i]>999 ? 999 : obs2pred[i]);
top_a += (one_over_rank[i]*one_over_rank[obs_rank]);
bottom_a1 += one_over_rank_sqr[i];
bottom_a2 += one_over_rank_sqr[obs_rank];
top_b += (one_over_rank[i]*one_over_rank[pred_rank]);
bottom_b1 += one_over_rank_sqr[i];
bottom_b2 += one_over_rank_sqr[pred_rank];
if (i>0 && (i+1) % 15 == 0)
{
rbs.add_real_feature(f_idx+2*round_idx,top_a/sqrt(bottom_a1*bottom_a2));
rbs.add_real_feature(f_idx+2*round_idx+1,top_b/sqrt(bottom_b1*bottom_b2));
round_idx++;
if (round_idx==3)
break;
}
}
f_idx+=6;
}
void DeNovoPartitionModel::fill_peak_prediction_features(
const PeptideSolution& sol,
const vector< vector<intensity_t> >& intens,
const PeakRankModel *peak_model,
RankBoostSample& rbs,
int specific_size) const
{
int f_idx = ppp_start_idx;
vector< vector<int> > ppp_prediction_ranks;
if (num_ppp_frags>1)
ppp_prediction_ranks.resize(num_ppp_frags);
PeptidePeakPrediction ppp;
peak_model->calc_peptide_predicted_scores(sol, ppp, specific_size, &ppp_frag_type_idxs);
int f;
for (f=0; f<num_ppp_frags; f++)
{
const int frag_idx = ppp_frag_type_idxs[f];
const vector<intensity_t>& frag_intens = intens[frag_idx];
const vector<float>& predicted_scores = ppp.rank_scores[frag_idx];
int i;
int num_obs_frags=0;
for (i=1; i<frag_intens.size(); i++)
if (frag_intens[i]>0)
num_obs_frags++;
int num_predicted=0;
for (i=1; i<predicted_scores.size(); i++)
if (predicted_scores[i]>NEG_INF)
num_predicted++;
///
rbs.add_real_feature(f_idx++,(float)num_obs_frags);
rbs.add_real_feature(f_idx++,(float)num_predicted);
if (num_predicted>0)
rbs.add_real_feature(f_idx,(float)num_obs_frags/(float)num_predicted);
f_idx++;
// for (i=0; i<frag_intens.size(); i++)
// cout << fixed << setprecision(1) << frag_intens[i] << "\t";
// cout << endl;
vector<int> prediction_ranks, inten_ranks;
find_prediction_ranks(predicted_scores,prediction_ranks);
find_inten_ranks(frag_intens, inten_ranks);
if (num_ppp_frags>1)
ppp_prediction_ranks[f]=prediction_ranks;
vector<int> max_ranks;
max_ranks.push_back(1);
max_ranks.push_back(3);
max_ranks.push_back(5);
max_ranks.push_back(7);
max_ranks.push_back((int)(0.1666*num_predicted));
max_ranks.push_back((int)(0.3333*num_predicted));
max_ranks.push_back((int)(0.5000*num_predicted));
max_ranks.push_back((int)(0.6667*num_predicted));
vector<int> counts;
counts.resize(max_ranks.size(),0);
for (i=0; i<predicted_scores.size(); i++)
if (predicted_scores[i]>NEG_INF)
{
int j;
for (j=0; j<counts.size(); j++)
if (prediction_ranks[i]>=0 &&
prediction_ranks[i]<max_ranks[j] &&
inten_ranks[i]<max_ranks[j])
counts[j]++;
}
for (i=0; i<4; i++)
{
if (max_ranks[i]>0 && num_obs_frags>=max_ranks[i])
rbs.add_real_feature(f_idx,(float)counts[i]);
f_idx++;
}
for ( ; i<counts.size(); i++)
{
if (max_ranks[i]>0 && num_obs_frags>=max_ranks[i])
rbs.add_real_feature(f_idx,(float)counts[i]/(float)max_ranks[i]);
f_idx++;
}
// find the prediction ranks for the hi
vector<int> missing_ranks;
missing_ranks.resize(3,prediction_ranks.size());
int num_miss=0;
int pred_rank;
for (pred_rank=0; pred_rank<prediction_ranks.size(); pred_rank++)
{
int cut_idx;
for (cut_idx=1; cut_idx<prediction_ranks.size(); cut_idx++)
if (prediction_ranks[cut_idx]==pred_rank)
break;
if (cut_idx==prediction_ranks.size())
break;
if (frag_intens[cut_idx]<=0)
missing_ranks[num_miss++]=pred_rank;
if (num_miss>=3)
break;
}
rbs.add_real_feature(f_idx++,(float)missing_ranks[0]);
rbs.add_real_feature(f_idx++,(float)missing_ranks[1]);
rbs.add_real_feature(f_idx++,(float)missing_ranks[2]);
rbs.add_real_feature(f_idx++,(float)(missing_ranks[0]+missing_ranks[1]));
rbs.add_real_feature(f_idx++,(float)(missing_ranks[0]+missing_ranks[1]+missing_ranks[2]));
// calculate the prediction score offsets
// (equals ths sum of scores for peaks that are between the peak's true
// position and the peaks predicted position)
vector<float> sorted_predicted_scores = predicted_scores;
sort(sorted_predicted_scores.begin(),sorted_predicted_scores.end());
const int max_idx = sorted_predicted_scores.size()-1;
const int mid = sorted_predicted_scores.size()/2;
for (i=0; i<mid; i++)
{
float t=sorted_predicted_scores[i];
sorted_predicted_scores[i]=sorted_predicted_scores[max_idx-i];
sorted_predicted_scores[max_idx-i]=t;
}
vector<float> score_offsets;
score_offsets.resize(10,NEG_INF);
for (pred_rank=0; pred_rank<10; pred_rank++)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -