📄 edgemodel.cpp
字号:
#include "EdgeModel.h"
#include "PrmGraph.h"
#include "AnnotatedSpectrum.h"
#include "Model.h"
#include "ME_REG.h"
// parameters for all regional models (magic numbers)
float EdgeModel::weight_single_state_score;
float EdgeModel::weight_single_offset_score;
float EdgeModel::weight_multi_state_score;
float EdgeModel::weight_multi_offset_score;
float EdgeModel::weight_transfer_score;
float EdgeModel::weight_combo_score;
float EdgeModel::multi_aa_penalty;
float EdgeModel::bad_pair_penalty;
float EdgeModel::problematic_pair_penalty;
vector<float> EdgeModel::tol_limits;
int RegionalEdgeModel::calc_tol_bin_idx(const Breakage* n_break, const Breakage *c_break,
mass_t exp_edge_mass) const
{
mass_t total_offset=0;
int num_offsets=0;
int i;
for (i=0; i<strong_frag_idxs.size(); i++)
{
const int n_pos = n_break->get_position_of_frag_idx(strong_frag_idxs[i]);
const int c_pos = c_break->get_position_of_frag_idx(strong_frag_idxs[i]);
if (n_pos<0 || c_pos<0)
continue;
const mass_t n_mass = n_break->fragments[n_pos].mass;
const mass_t c_mass = c_break->fragments[c_pos].mass;
const mass_t offset = fabs(fabs((n_mass-c_mass)*strong_frag_charges[i])-exp_edge_mass);
total_offset+=offset;
num_offsets++;
}
total_offset *= one_over_tolerance;
return EdgeModel::get_tol_bin_idx((float)total_offset);
}
void RegionalEdgeModel::set_params(Config *config, int c, int s, int r)
{
charge = c;
region_idx = r;
size_idx = s;
const RegionalFragments& rf = config->get_regional_fragments(c,s,r);
strong_frag_idxs = rf.get_strong_frag_type_idxs();
num_states = 1;
strong_frag_charges.resize(strong_frag_idxs.size());
int i;
for (i=0; i<strong_frag_idxs.size(); i++)
{
strong_frag_charges[i]=config->get_fragment(strong_frag_idxs[i]).charge;
num_states *=2;
}
one_over_tolerance = 1.0/config->get_tolerance();
}
void RegionalEdgeModel::train_regional_edge_model(void *model_ptr, const FileManager& fm, const FileSet& fs)
{
Model *model = (Model *)model_ptr;
Config *config = model->get_config();
const vector<mass_t>& aa2mass = config->get_aa2mass();
const vector<int>& org_aa = config->get_org_aa();
const mass_t tolerance = config->get_tolerance();
const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
const int num_limits = EdgeModel::get_num_tol_limits();
bool verbose = true;
vector<double> good_single_state_counts, bad_single_state_counts;
vector< vector< double > > good_single_off_counts, bad_single_off_counts;
vector<double> good_multi_state_counts, bad_multi_state_counts;
vector< vector< double > > good_multi_off_counts, bad_multi_off_counts;
vector< vector< double > > good_trans_counts, bad_trans_counts;
vector< vector< double > > good_aa_combo_counts, bad_aa_combo_counts;
good_single_state_counts.resize(num_states,2.0);
bad_single_state_counts.resize(num_states,2.0);
good_multi_state_counts.resize(num_states,2.0);
bad_multi_state_counts.resize(num_states,2.0);
int i;
good_single_off_counts.resize(num_states);
bad_single_off_counts.resize(num_states);
good_multi_off_counts.resize(num_states);
bad_multi_off_counts.resize(num_states);
for (i=0; i<num_states; i++)
{
good_single_off_counts[i].resize(num_limits,2.0);
bad_single_off_counts[i].resize(num_limits,2.0);
good_multi_off_counts[i].resize(num_limits,2.0);
bad_multi_off_counts[i].resize(num_limits,2.0);
}
good_trans_counts.resize(num_states);
bad_trans_counts.resize(num_states);
for (i=0; i<num_states; i++)
{
good_trans_counts[i].resize(num_states,5.0);
bad_trans_counts[i].resize(num_states,5.0);
}
good_aa_combo_counts.resize(Val+1);
bad_aa_combo_counts.resize(Val+1);
for (i=0; i<=Val; i++)
{
good_aa_combo_counts[i].resize(Val+1,1.0);
bad_aa_combo_counts[i].resize(Val+1,1.0);
}
vector<QCPeak> peaks;
peaks.resize(10000);
BasicSpecReader bsr;
if (verbose)
{
cout << endl;
cout << "Computing Edge Model for " << charge << " " << size_idx << " " << region_idx << endl << endl;
}
double num_good_combos=0;
double num_bad_combos=0;
int sc;
for (sc=0; sc<all_ssf.size(); sc++)
{
SingleSpectrumFile *ssf = all_ssf[sc];
const Peptide& pep = ssf->peptide;
const vector<int> pep_aas = pep.get_amino_acids();
const mass_t pm_with_19 = pep.get_mass_with_19();
BasicSpectrum bs;
const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
bs.num_peaks = num_peaks;
bs.peaks = &peaks[0];
bs.ssf = ssf;
AnnotatedSpectrum as;
as.init_from_QCPeaks(config,&peaks[0],num_peaks,ssf);
as.set_peptide(pep);
as.annotate_spectrum(pm_with_19,true);
PrmGraph prm;
prm.create_graph_from_spectrum(model,&as,pm_with_19,charge);
const int num_nodes = prm.get_num_nodes();
vector<int> good_node_inds;
vector<mass_t> correct_break_masses;
pep.calc_expected_breakage_masses(config,correct_break_masses);
good_node_inds.resize(num_nodes,0);
int i;
for (i=0; i<correct_break_masses.size(); i++)
{
int max_node_idx = prm.get_max_score_node(correct_break_masses[i],tolerance);
if (max_node_idx>=0)
good_node_inds[max_node_idx]=1;
max_node_idx = prm.get_max_score_node(pm_with_19 - correct_break_masses[i],tolerance);
if (max_node_idx>=0)
good_node_inds[max_node_idx]=2;
}
const vector<Node>& nodes = prm.get_nodes();
const vector<MultiEdge>& edges = prm.get_multi_edges();
// collect edge data
const vector<MultiEdge>& multi_edges = prm.get_multi_edges();
for (i=0; i<multi_edges.size(); i++)
{
const MultiEdge& edge = multi_edges[i];
if (prm.get_node(edge.n_idx).breakage.region_idx != region_idx)
continue;
// state offset data
bool good_edge=false;
if (good_node_inds[edge.n_idx] == 1 && good_node_inds[edge.c_idx] == 1)
{
good_edge=true;
}
else if (good_node_inds[edge.n_idx] == 0 && good_node_inds[edge.c_idx] == 0)
{
good_edge=false;
}
else
continue;
const Breakage *n_break = &prm.get_node(edge.n_idx).breakage;
const Breakage *c_break = &prm.get_node(edge.c_idx).breakage;
if (! n_break || ! c_break )
continue;
const int n_state = calc_state(n_break);
const int c_state = calc_state(c_break);
// if (n_state==0 || c_state == 0)
// continue;
const int intersec_state = this->calc_intersec_state(n_state,c_state);
if (good_edge)
{
if (edge.num_aa==1)
{
good_single_state_counts[intersec_state]++;
}
else
good_multi_state_counts[intersec_state]++;
}
else
if (edge.num_aa==1)
{
bad_single_state_counts[intersec_state]++;
}
else
bad_multi_state_counts[intersec_state]++;
int v;
for (v=0; v<edge.variant_ptrs.size(); v++)
{
int *v_ptr = edge.variant_ptrs[v];
const int num_aa = *v_ptr++;
mass_t exp_edge_mass=0;
int j;
for (j=0; j<num_aa; j++)
exp_edge_mass+=aa2mass[v_ptr[j]];
const int offset_bin = this->calc_tol_bin_idx(n_break,c_break,exp_edge_mass);
if (good_edge)
{
int j;
for (j=0; j<correct_break_masses.size(); j++)
if (fabs(correct_break_masses[j]-n_break->mass)<1.0)
break;
if (j>=correct_break_masses.size()-1)
{
continue;
}
else
if ( (num_aa == 2 && (v_ptr[0] != pep_aas[j] || v_ptr[1] != pep_aas[j+1])) ||
(num_aa == 1 && (v_ptr[0] != pep_aas[j]) ) )
continue;
if (num_aa==1)
{
good_single_off_counts[intersec_state][offset_bin]++;
}
else
good_multi_off_counts[intersec_state][offset_bin]++;
good_trans_counts[n_state][c_state]++;
}
else
{
if (num_aa==1)
{
bad_single_off_counts[intersec_state][offset_bin]++;
}
else
bad_multi_off_counts[intersec_state][offset_bin]++;
bad_trans_counts[n_state][c_state]++;
}
// fill reg samples
if (num_aa==2)
{
const int n_aa = v_ptr[0];
const int c_aa = v_ptr[1];
// make sure variant is good
if (good_edge)
{
good_aa_combo_counts[org_aa[n_aa]][org_aa[c_aa]]++;
num_good_combos++;
}
else
{
bad_aa_combo_counts[org_aa[n_aa]][org_aa[c_aa]]++;
num_bad_combos++;
}
}
}
}
if (sc>0 && sc %500 == 0)
{
cout << sc << "/" << all_ssf.size() << " ..." << endl;
}
}
// compute probs
double n_good_single_state_counts=0, n_bad_single_state_counts=0;
double n_good_multi_state_counts=0, n_bad_multi_state_counts=0;
double n_good_trans_counts=0, n_bad_trans_counts=0;
vector<double> n_good_single_off_counts,n_bad_single_off_counts;
vector<double> n_good_multi_off_counts, n_bad_multi_off_counts;
int j;
for (i=0; i<good_single_state_counts.size(); i++)
{
n_good_single_state_counts+=good_single_state_counts[i];
n_good_multi_state_counts+=good_multi_state_counts[i];
}
for (i=0; i<bad_single_state_counts.size(); i++)
{
n_bad_single_state_counts+=bad_single_state_counts[i];
n_bad_multi_state_counts+=bad_multi_state_counts[i];
}
n_good_single_off_counts.resize(num_states,0);
n_bad_single_off_counts.resize(num_states,0);
n_good_multi_off_counts.resize(num_states,0);
n_bad_multi_off_counts.resize(num_states,0);
for (i=0; i<num_states; i++)
for (j=0; j<num_limits; j++)
{
n_good_single_off_counts[i]+=good_single_off_counts[i][j];
n_bad_single_off_counts[i]+=bad_single_off_counts[i][j];
n_good_multi_off_counts[i]+=good_multi_off_counts[i][j];
n_bad_multi_off_counts[i]+=bad_multi_off_counts[i][j];
}
n_good_trans_counts=0;
n_bad_trans_counts=0;
for (i=0; i<num_states; i++)
for (j=0; j<num_states; j++)
{
n_good_trans_counts+=good_trans_counts[i][j];
n_bad_trans_counts+=bad_trans_counts[i][j];
}
log_odds_state.resize(num_states);
multi_log_odds_state.resize(num_states);
for (i=0; i<num_states; i++)
{
log_odds_state[i]=log(good_single_state_counts[i]/n_good_single_state_counts)-
log(bad_single_state_counts[i]/n_bad_single_state_counts);
multi_log_odds_state[i]=log(good_multi_state_counts[i]/n_good_multi_state_counts)-
log(bad_multi_state_counts[i]/n_bad_multi_state_counts);
}
log_odds_offset.resize(num_states);
multi_log_odds_offset.resize(num_states);
log_odds_transfer.resize(num_states);
for (i=0; i<num_states; i++)
{
log_odds_offset[i].resize(num_limits);
multi_log_odds_offset[i].resize(num_limits);
log_odds_transfer[i].resize(num_states);
int j;
for (j=0; j<num_limits; j++)
{
log_odds_offset[i][j]=log(good_single_off_counts[i][j]/n_good_single_off_counts[i])-
log(bad_single_off_counts[i][j]/n_bad_single_off_counts[i]);
if (j>0 && log_odds_offset[i][j]>log_odds_offset[i][j-1])
log_odds_offset[i][j]=log_odds_offset[i][j-1];
multi_log_odds_offset[i][j]=log(good_multi_off_counts[i][j]/n_good_multi_off_counts[i])-
log(bad_multi_off_counts[i][j]/n_bad_multi_off_counts[i]);
if (j>0 && multi_log_odds_offset[i][j]>multi_log_odds_offset[i][j-1])
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -