📄 denovopartmodel.cpp
字号:
int i;
for (i=0; i<amino_acids.size(); i++)
if (amino_acids[i] == m16_aa_idx)
{
float total_inten=0;
int f;
for (f=0; f<ppp_fragments.size(); f++)
{
if (ppp_fragments[f].orientation == PREFIX)
{
const float& inten = intens[ppp_frag_type_idxs[f]][i+1];
if (inten>0)
total_inten+= inten;
}
else
{
const float& inten = intens[ppp_frag_type_idxs[f]][i];
if (inten>0)
total_inten+= inten;
}
}
pairs.push_back(score_pair(i,total_inten));
}
sort(pairs.begin(),pairs.end());
vector<mass_t> break_masses;
sol.pep.calc_expected_breakage_masses(config,break_masses);
if (pairs.size()>0)
{
const int num_aa = amino_acids.size();
for (i=0; i<pairs.size() && i<MAX_NUM_M16s; i++)
{
const int aa_idx=pairs[i].idx;
if (intens[b_frag_idx].size()>aa_idx+1)
{
const float b_inten = intens[b_frag_idx][aa_idx+1];
const int pre_minus_63_idx = as.get_max_inten_peak(break_masses[aa_idx+1]-63.0,tolerance);
float ratio_pre_minus_63=-10.0;
if (b_inten == 0)
{
if (pre_minus_63_idx>=0)
ratio_pre_minus_63 = 11.0;
}
else
{
ratio_pre_minus_63 = ( pre_minus_63_idx<0 ? 0 : as.get_peak_intensity(pre_minus_63_idx)/b_inten);
if (ratio_pre_minus_63>10.0)
ratio_pre_minus_63=10.0;
}
rbs.add_real_feature(f_idx+i*2,ratio_pre_minus_63);
}
if (intens[y_frag_idx].size()>aa_idx)
{
const mass_t pep_mass = sol.pep.get_mass();
const float y_inten = intens[y_frag_idx][aa_idx];
const int suf_minus_45_idx = as.get_max_inten_peak((pep_mass - break_masses[aa_idx])-45.0,tolerance);
float ratio_suf_minus_45=-10;
if (y_inten == 0)
{
if (suf_minus_45_idx>=0)
ratio_suf_minus_45 = 11.0;
}
else
{
ratio_suf_minus_45 = ( suf_minus_45_idx<0 ? 0 : as.get_peak_intensity(suf_minus_45_idx)/y_inten);
if (ratio_suf_minus_45>10.0)
ratio_suf_minus_45=10.0;
}
rbs.add_real_feature(f_idx+i*2+1,ratio_suf_minus_45);
}
}
}
}
f_idx += 4;
}
void DeNovoRankScorer::init_tables(bool silent_ind)
{
PeakRankModel *& peak_model = peak_prediction_models[model_type];
if (! model || ! model->get_ind_pmcsqs_was_intialized())
{
cout << "Error: must first initialize the fragment model!" << endl;
exit(1);
}
if (! peak_model || peak_model->get_size_thresholds().size()<=0)
{
cout << "Error: must first initialize the peak model!" << endl;
exit(1);
}
const vector< vector< mass_t> >& size_thresholds = peak_model->get_size_thresholds();
this->dnv_part_models.resize(size_thresholds.size());
int i;
if (! silent_ind)
cout << "Init tables:" << endl;
for (i=1; i<dnv_part_models.size(); i++)
{
dnv_part_models[i].resize(size_thresholds[i].size()+1,NULL);
if (! silent_ind)
cout << i << "\t" << size_thresholds[i].size()+1 << endl;
}
}
void DeNovoPartitionModel::init_features(int model_type, int _charge, int _size_idx,
const vector<int>& ppp_frags, Config *config)
{
charge = _charge;
size_idx = _size_idx;
ScalingFactor def_scale;
def_scale.max_pm_with_19 = POS_INF;
def_scale.score_shift = 0;
def_scale.score_scale = 1.0;
scaling_factors.clear();
scaling_factors.push_back(def_scale);
feature_names.clear();
ind_was_initialized = true;
use_PTM_peak_features = true;
use_tryp_terminal_features = true;
use_ann_peak_features = true;
use_inten_balance_features = (model_type != 3);
use_peak_offset_features = true;
use_comp_features = true;
use_pmc_features = (model_type != 3);
use_prm_features = true;
use_ppp_features = false;
use_combined_ppp_features = true;
if (use_combined_ppp_features && use_ppp_features)
{
cout << "Error: must choose combine or regular ppp features, not both!" << endl;
exit(1);
}
num_ppp_frags=ppp_frags.size();
ppp_frag_type_idxs=ppp_frags;
ppp_fragments.resize(ppp_frag_type_idxs.size());
int f;
for (f=0; f<ppp_frag_type_idxs.size(); f++)
ppp_fragments[f] = config->get_fragment(ppp_frag_type_idxs[f]);
// features for special PTM peaks (like the ones for M+16)
PTM_peak_start_idx = feature_names.size();
if (use_PTM_peak_features)
{
feature_names.push_back("M+16 first p-63 ratio");
feature_names.push_back("M+16 first s-45 ratio");
feature_names.push_back("M+16 second p-63 ratio");
feature_names.push_back("M+16 second s-45 ratio");
}
// tryptic terminal features
tryp_terminal_start_idx = feature_names.size();
if (use_tryp_terminal_features)
{
feature_names.push_back("TRYP #num good tryp terminals");
feature_names.push_back("TRYP #num missed tryp terminals");
feature_names.push_back("TRYP C-term AA");
feature_names.push_back("TRYP #frags at digest when C-term is R");
feature_names.push_back("TRYP #frags at digest when C-term is K");
feature_names.push_back("TRYP #frags at digest when C-term is other");
feature_names.push_back("TRYP AA at N-terminal When C-term is R");
feature_names.push_back("TRYP AA at N-terminal When C-term is K");
feature_names.push_back("TRYP AA at N-terminal When C-term is other");
}
// Ann peak features
ann_peak_start_idx = feature_names.size();
if (use_ann_peak_features)
{
feature_names.push_back("ANN PEAK diff from org pm_with_19");
feature_names.push_back("ANN PEAK # aas in peptide");
feature_names.push_back("ANN PEAK %ann intensity");
feature_names.push_back("ANN PEAK %ann peaks");
feature_names.push_back("ANN PEAK #ann in top 25");
feature_names.push_back("ANN PEAK #ann in top half (up to 50)");
feature_names.push_back("ANN PEAK #ann in top third - #ann in mid third");
feature_names.push_back("ANN PEAK #ann in top third - #ann in last third");
feature_names.push_back("ANN PEAK #ann in mid third - #ann in last third");
const vector<FragmentType>& all_fragments = config->get_all_fragments();
int f;
for (f=0; f<all_fragments.size() && f<7; f++)
{
const string frag_label = all_fragments[f].label;
feature_names.push_back( "ANN PEAK #" + frag_label + " annotated");
}
}
// inten balance features
inten_balance_start_idx = feature_names.size();
if (use_inten_balance_features)
{
feature_names.push_back("INTEN BAL c_idx - n_idx");
feature_names.push_back("INTEN BAL RHK N");
feature_names.push_back("INTEN BAL RHK C");
feature_names.push_back("INTEN BAL RHK pair");
feature_names.push_back("INTEN BAL prefix prop, pair -4,-5");
feature_names.push_back("INTEN BAL prefix prop, pair -2,-3");
feature_names.push_back("INTEN BAL prefix prop, pair -1,0,+1");
feature_names.push_back("INTEN BAL prefix prop, pair +2,+3");
feature_names.push_back("INTEN BAL prefix prop, pair +4,+5");
feature_names.push_back("INTEN BAL all prefix prop, pair -4,-5");
feature_names.push_back("INTEN BAL all prefix prop, pair -2,-3");
feature_names.push_back("INTEN BAL all prefix prop, pair -1,0,+1");
feature_names.push_back("INTEN BAL all prefix prop, pair +2,+3");
feature_names.push_back("INTEN BAL all prefix prop, pair +4,+5");
}
// Peak offset features
peak_offset_start_idx=feature_names.size();
if (use_peak_offset_features)
{
int f;
for (f=0; f<ppp_frag_type_idxs.size() && f<2; f++)
{
const int frag_idx = ppp_frag_type_idxs[f];
const string frag_label = config->get_fragment(frag_idx).label;
const string prefix = "PEAK OFF " + frag_label + " ";
feature_names.push_back(prefix+"num frags detected");
feature_names.push_back(prefix+"max self offset");
feature_names.push_back(prefix+"avg self offset");
feature_names.push_back(prefix+"max consecutive offset");
feature_names.push_back(prefix+"avg consecutive offset");
// peak grab feature
feature_names.push_back(prefix+"grab offset #1");
feature_names.push_back(prefix+"grab offset #2");
feature_names.push_back(prefix+"grab offset #3");
}
if (f<2)
{
int i;
for (i=0; i<8; i++)
feature_names.push_back("PEAK OFF dummy");
}
}
// Peptide composition features
comp_start_idx=feature_names.size();
if (use_comp_features)
{
feature_names.push_back("PEP COMP start cat N (len 3)");
feature_names.push_back("PEP COMP end cat C (len 3)");
feature_names.push_back("PEP COMP len 3 # cat 19-20");
feature_names.push_back("PEP COMP len 3 # cat 15-18");
feature_names.push_back("PEP COMP len 3 # cat 7-14");
feature_names.push_back("PEP COMP len 3 # cat 3-6");
feature_names.push_back("PEP COMP len 3 # cat 1-2");
feature_names.push_back("PEP COMP min cat, len 3");
feature_names.push_back("PEP COMP avg cat, len 3");
feature_names.push_back("PEP COMP before cat score 1");
feature_names.push_back("PEP COMP after cat score 1");
feature_names.push_back("PEP COMP span cat score 1");
feature_names.push_back("PEP COMP before cat score 2");
feature_names.push_back("PEP COMP after cat score 2");
feature_names.push_back("PEP COMP span cat score 2");
feature_names.push_back("PEP COMP before cat score 3");
feature_names.push_back("PEP COMP after cat score 3");
feature_names.push_back("PEP COMP span cat score 3");
feature_names.push_back("PEP COMP before cat score 4");
feature_names.push_back("PEP COMP after cat score 4");
feature_names.push_back("PEP COMP span cat score 4");
const vector<string>& aa2label = config->get_aa2label();
int a;
for (a=Ala; a<=Val; a++)
{
if (a==Ile)
continue;
feature_names.push_back("PEP COMP #aa " + aa2label[a]);
}
feature_names.push_back("PEP COMP #problematic double combos");
feature_names.push_back("PEP COMP #double combo=W");
feature_names.push_back("PEP COMP #double combo=Q");
feature_names.push_back("PEP COMP #double combo=N");
feature_names.push_back("PEP COMP #double problematic combos with XG");
}
// PMCSQS features
pmc_start_idx=feature_names.size();
if (use_pmc_features)
{
feature_names.push_back("PMCSQS sqs prob for peptide charge");
feature_names.push_back("PMCSQS prob for peptide charge");
feature_names.push_back("PMCSQS mass diff from pm1, prob>0.95");
feature_names.push_back("PMCSQS mass diff from pm1, prob<=0.95");
feature_names.push_back("PMCSQS score1 for peptide charge");
feature_names.push_back("PMCSQS score2 for peptide charge");
feature_names.push_back("PMCSQS mass diff from pm2");
feature_names.push_back("PMCSQS max prob for other charges");
feature_names.push_back("PMCSQS score diff from max score with this charge, prob>=0.95");
feature_names.push_back("PMCSQS score diff from max score with this charge, 0.95>prob>=0.7");
feature_names.push_back("PMCSQS score diff from max score with this charge, prob<0.7");
}
prm_start_idx = feature_names.size();
if (use_prm_features) // use these feature only with de novo
{
const string term_combo[4]={"N/C","N/-C","-N/C","-N/-C"};
int i;
for (i=0; i<4; i++)
{
if (i>0 && (model_type != 1 && model_type != 3))
break;
feature_names.push_back("PRM " + term_combo[i] + " delta mass");
feature_names.push_back("PRM " + term_combo[i] + " total breakage score");
feature_names.push_back("PRM " + term_combo[i] + " average breakage score");
feature_names.push_back("PRM " + term_combo[i] + " normalized average breakage score");
feature_names.push_back("PRM " + term_combo[i] + " path score");
feature_names.push_back("PRM " + term_combo[i] + " average path score");
}
// adjust to random prob in spectrum (so model can be used with FT)
if (model_type == 1 || model_type == 3)
{
feature_names.push_back("PRM path score");
feature_names.push_back("PRM total breakage score");
feature_names.push_back("PRM SeqPath rank");
feature_names.push_back("PRM multipath score");
feature_names.push_back("PRM delta score");
feature_names.push_back("PRM rank, delta score<=1.5");
feature_names.push_back("PRM rank, 1.5<delta score<=7.5");
feature_names.push_back("PRM rank, 7.5<delta score<=15");
feature_names.push_back("PRM rank, delta score>15");
if (model_type == 3)
{
feature_names.push_back("PRM tag, percent in top 5 denovo");
feature_names.push_back("PRM tag, percent in top 20 denovo");
feature_names.push_back("PRM tag, percent in all denovo");
feature_names.push_back("PRM tag, rank if in top 5");
feature_names.push_back("PRM tag, rank if in top 5-20");
feature_names.push_back("PRM tag, rank if in top 20-all");
feature_names.push_back("PRM tag, highest full denovo rank");
}
}
feature_names.push_back("PRM delta num breakage scores (missing)");
// feature_names.push_back("PRM num missing edges");
feature_names.push_back("PRM num forbidden node pairs");
feature_names.push_back("PRM num breakage scores");
feature_names.push_back("PRM breakage score min 1");
feature_names.push_back("PRM breakage score min 2");
feature_names.push_back("PRM breakage score min 3");
feature_names.push_back("PRM breakage score min consecutive 3");
feature_names.push_back("PRM breakage score max consecutive 3");
feature_names.push_back("PRM breakage score min consecutive 2");
feature_names.push_back("PRM breakage score max consecutive 2");
feature_names.push_back("PRM #breakage scores below -10");
feature_names.push_back("PRM #breakage scores 0 - -10");
feature_names.push_back("PRM #breakage scores 0 - 8");
feature_names.push_back("PRM #breakage scores 8 - 15");
feature_names.push_back("PRM #breakage scores above 15");
feature_names.push_back("PRM %breakage scores below -10");
feature_names.push_back("PRM %breakage scores below 0");
feature_names.push_back("PRM %breakage scores above 0");
feature_names.push_back("PRM %breakage scores above 8");
feature_names.push_back("PRM Score connected to N-terminal");
feature_names.push_back("PRM Score connected to C-terminal");
feature_names.push_back("PRM %breakages with 1 frag detected");
feature_names.push_back("PRM %breakages with 2 frag detected");
feature_names.push_back("PRM %breakages with > 5 frags detected");
feature_names.push_back("PRM %breakages with dual orientation frags");
feature_names.push_back("PRM #orientation switches");
}
// Peak prediction features
ppp_start_idx=feature_names.size();
if (use_ppp_features)
{
int f;
for (f=0; f<ppp_frag_type_idxs.size(); f++)
{
const int frag_idx = ppp_frag_type_idxs[f];
const string frag_label = config->get_fragment(frag_idx).label;
const string prefix = "PPP " + frag_label + " ";
feature_names.push_back(prefix+"# observed frags");
feature_names.push_back(prefix+"# predicted frags");
feature_names.push_back(prefix+"observation ratio");
feature_names.push_back(prefix+"# observed frags in top 1 predicted");
feature_names.push_back(prefix+"# observed frags in top 3 predicted");
feature_names.push_back(prefix+"# observed frags in top 5 predicted");
feature_names.push_back(prefix+"# observed frags in top 7 predicted");
feature_names.push_back(prefix+"% observed frags in top 1/6 predicted");
feature_names.push_back(prefix+"% observed frags in top 1/3 predicted");
feature_names.push_back(prefix+"% observed frags in top 1/2 predicted");
feature_names.push_back(prefix+"% observed frags in top 2/3 predicted");
feature_names.push_back(prefix+" predicted rank of first missing peak");
feature_names.push_back(prefix+" predicted rank of second missing peak");
feature_names.push_back(prefix+" predicted rank of third missing peak");
feature_names.push_back(prefix+" predicted rank of first+second missing peak");
feature_names.push_back(prefix+" predicted rank of first+second+third missing peak");
feature_names.push_back(prefix+"score offset of rank 1");
feature_names.push_back(prefix+"score offset of rank 2");
feature_names.push_back(prefix+"score offset of rank 3");
feature_names.push_back(prefix+"score offset of rank 4");
feature_names.push_back(prefix+"score offset of rank 5");
feature_names.push_back(prefix+"score offset of rank 6");
feature_names.push_back(prefix+"score offset of rank 7");
feature_names.push_back(prefix+"score offset of rank 8");
feature_names.push_back(prefix+"score offset of rank 9");
feature_names.push_back(prefix+"score offset of rank 10");
}
feature_names.push_back("PPP #comp pairs");
feature_names.push_back("PPP stat of predicted pair #1");
feature_names.push_back("PPP stat of predicted pair #2");
feature_names.push_back("PPP stat of predicted pair #3");
feature_names.push_back("PPP stat of predicted pair #4");
feature_names.push_back("PPP stat of predicted pair #5");
feature_names.push_back("PPP stat of predicted pair #6");
feature_names.push_back("PPP stat of predicted pair #7");
}
combined_ppp_start_idx = feature_names.size();
if (use_combined_ppp_features)
{
feature_names.push_back("COMB PPP mobility");
feature_names.push_back("COMP PPP frag 1 obs_ratio");
feature_names.push_back("COMP PPP frag 2 obs_ratio");
feature_names.push_back("COMP PPP frag 3 obs_ratio");
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -