📄 edgemodel.cpp
字号:
// resize according to threhsolds, assume that the last mass is POS_INF
init_edge_model_defaults();
regional_edge_models.resize(size_thresholds.size());
int c;
for (c=0; c<size_thresholds.size(); c++)
if (size_thresholds[c].size()>0)
regional_edge_models[c].resize(size_thresholds[c].size());
for (c=0; c<regional_edge_models.size(); c++)
{
if (specific_charge>0 && c != specific_charge)
continue;
int s;
for (s=0; s<regional_edge_models[c].size(); s++)
{
const int num_regions = config->get_num_regions(c,s);
regional_edge_models[c][s].resize(num_regions,NULL);
// train all regions
FileSet fs;
mass_t min_mz=0;
mass_t max_mz = 10000;
if (s>0)
min_mz = (size_thresholds[c][s-1]+c-1)/(mass_t)c;
if (s< size_thresholds[c].size())
max_mz = (size_thresholds[c][s]+c-1)/(mass_t)c;
fs.select_files_in_mz_range(fm,min_mz,max_mz,c);
if (fs.get_total_spectra()<300)
{
cout << "Warning: not enough spectra for charge " << c << " size " << s << ", not training edge models!" << endl;
continue;
}
fs.randomly_reduce_ssfs(20000);
// set region frag_idxs
int r;
for (r=0; r<num_regions; r++)
{
regional_edge_models[c][s][r] = new RegionalEdgeModel;
regional_edge_models[c][s][r]->set_params(config,c,s,r);
regional_edge_models[c][s][r]->train_regional_edge_model(model,fm,fs);
}
}
}
// train double combo model
if (1)
{
FileSet fs;
fs.select_all_files(fm);
fs.randomly_reduce_ssfs(250000);
Model *model = (Model *)model_ptr;
Config *config = model->get_config();
const mass_t tolerance = config->get_tolerance();
const vector<mass_t>& aa2mass = config->get_aa2mass();
const vector<int>& org_aa = config->get_org_aa();
const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
bool verbose = true;
vector< vector< double > > good_aa_combo_counts, bad_aa_combo_counts;
int i;
good_aa_combo_counts.resize(Val+1);
bad_aa_combo_counts.resize(Val+1);
for (i=0; i<=Val; i++)
{
good_aa_combo_counts[i].resize(Val+1,1.0);
bad_aa_combo_counts[i].resize(Val+1,1.0);
}
vector<QCPeak> peaks;
peaks.resize(10000);
BasicSpecReader bsr;
if (verbose)
{
cout << endl;
cout << "Computing double edge model..." << endl;
}
double num_good_combos=0;
double num_bad_combos=0;
int sc;
for (sc=0; sc<all_ssf.size(); sc++)
{
SingleSpectrumFile *ssf = all_ssf[sc];
const Peptide& pep = ssf->peptide;
const vector<int> pep_aas = pep.get_amino_acids();
const mass_t pm_with_19 = pep.get_mass_with_19();
BasicSpectrum bs;
const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
bs.num_peaks = num_peaks;
bs.peaks = &peaks[0];
bs.ssf = ssf;
AnnotatedSpectrum as;
as.init_from_QCPeaks(config,&peaks[0],num_peaks,ssf);
as.set_peptide(pep);
as.annotate_spectrum(pm_with_19,true);
PrmGraph prm;
prm.create_graph_from_spectrum(model,&as,pm_with_19,ssf->charge);
const int num_nodes = prm.get_num_nodes();
vector<int> good_node_inds;
vector<mass_t> correct_break_masses;
pep.calc_expected_breakage_masses(config,correct_break_masses);
good_node_inds.resize(num_nodes,0);
int i;
for (i=0; i<correct_break_masses.size(); i++)
{
int max_node_idx = prm.get_max_score_node(correct_break_masses[i],tolerance);
if (max_node_idx>=0)
good_node_inds[max_node_idx]=1;
max_node_idx = prm.get_max_score_node(pm_with_19 - correct_break_masses[i],tolerance);
if (max_node_idx>=0)
good_node_inds[max_node_idx]=2;
}
const vector<Node>& nodes = prm.get_nodes();
const vector<MultiEdge>& edges = prm.get_multi_edges();
// collect edge data
const vector<MultiEdge>& multi_edges = prm.get_multi_edges();
for (i=0; i<multi_edges.size(); i++)
{
const MultiEdge& edge = multi_edges[i];
if (edge.num_aa !=2)
continue;
// state offset data
bool good_edge=false;
if (good_node_inds[edge.n_idx] == 1 && good_node_inds[edge.c_idx] == 1)
{
good_edge=true;
}
else if (good_node_inds[edge.n_idx] == 0 && good_node_inds[edge.c_idx] == 0)
{
good_edge=false;
}
else
continue;
const Breakage *n_break = &prm.get_node(edge.n_idx).breakage;
const Breakage *c_break = &prm.get_node(edge.c_idx).breakage;
if (! n_break || ! c_break )
continue;
int v;
for (v=0; v<edge.variant_ptrs.size(); v++)
{
int *v_ptr = edge.variant_ptrs[v];
const int num_aa = *v_ptr++;
if (good_edge)
{
int j;
for (j=0; j<correct_break_masses.size(); j++)
if (fabs(correct_break_masses[j]-n_break->mass)<1.0)
break;
if (j>=correct_break_masses.size()-1)
{
continue;
}
else
if ( num_aa == 2 && (v_ptr[0] != pep_aas[j] || v_ptr[1] != pep_aas[j+1]))
continue;
}
const int n_aa = v_ptr[0];
const int c_aa = v_ptr[1];
// make sure variant is good
if (good_edge)
{
good_aa_combo_counts[org_aa[n_aa]][org_aa[c_aa]]++;
num_good_combos++;
}
else
{
bad_aa_combo_counts[org_aa[n_aa]][org_aa[c_aa]]++;
num_bad_combos++;
}
}
}
if (sc>0 && sc %1000 == 0)
{
cout << sc << "/" << all_ssf.size() << " ..." << endl;
}
}
cout << "Combo table:" << endl;
double_combo_scores.resize(Val+1);
for (i=Ala; i<=Val; i++)
{
double_combo_scores[i].resize(Val+1,0);
if (i==Ile)
continue;
int j;
for (j=Ala; j<=Val; j++)
if (j!=Ile)
double_combo_scores[i][j]=log(good_aa_combo_counts[i][j]/num_good_combos)-
log(bad_aa_combo_counts[i][j]/num_bad_combos);
}
const vector<string>& aa2label = config->get_aa2label();
cout << fixed << setprecision(1) << setw(5);
cout << " ";
for (i=Ala; i<=Val; i++)
cout << aa2label[i] << " ";
cout << endl;
for (i=Ala; i<=Val; i++)
{
cout << aa2label[i] << " ";
int j;
for (j=Ala; j<=Val; j++)
{
printf("%.1f ",double_combo_scores[i][j]);
if (double_combo_scores[i][j]>=0)
printf(" ");
}
cout << endl;
}
}
ind_was_initialized = true;
}
void EdgeModel::init_edge_model_defaults()
{
weight_single_state_score = 1.0;
weight_single_offset_score = 1.0;
weight_multi_state_score = 1.0;
weight_multi_offset_score = 1.0;
weight_transfer_score = 1.0;
weight_combo_score = 3.0;
multi_aa_penalty = -8.0;
bad_pair_penalty = -5.0;
problematic_pair_penalty = -2.0;
tol_limits.clear();
tol_limits.push_back(0.1);
tol_limits.push_back(0.25);
tol_limits.push_back(0.5);
tol_limits.push_back(0.9);
tol_limits.push_back(POS_INF);
ind_was_initialized = false;
}
// assigns probs and scores to graph edges
void EdgeModel::score_graph_edges(PrmGraph& prm) const
{
if (! this->ind_was_initialized)
return;
const vector<int>& org_aa = config->get_org_aa();
const vector<mass_t>& aa2mass = config->get_aa2mass();
const vector< vector< bool > >& allowed_double_edge = config->get_allowed_double_edge();
const vector< vector< bool > >& double_edge_with_same_mass_as_single = config->get_double_edge_with_same_mass_as_single();
const int charge = prm.charge;
const int size_idx = prm.size_idx;
const vector<Node>& nodes = prm.nodes;
const int num_edges = prm.multi_edges.size();
int i;
for (i=0; i<num_edges; i++)
{
MultiEdge& edge = prm.multi_edges[i];
const Breakage *n_break = &prm.get_node(edge.n_idx).breakage;
const Breakage *c_break = &prm.get_node(edge.c_idx).breakage;
const int region_idx = (n_break ? n_break->region_idx : 0);
const RegionalEdgeModel* rem = regional_edge_models[charge][size_idx][region_idx];
if (! rem)
{
cout << "Error: no edge model for " << charge << " " << size_idx << " " << region_idx << "!" << endl;
exit(1);
}
const int n_state = (n_break ? rem->calc_state(n_break) : 0);
const int c_state = (c_break ? rem->calc_state(c_break) : 0);
const int intersec_state = rem->calc_intersec_state(n_state,c_state);
const bool add_only_positive_state_scores = (edge.n_idx == 0 || edge.c_idx == nodes.size()-1 ||
nodes[edge.c_idx].type == NODE_DIGEST);
edge.max_variant_score = NEG_INF;
int v;
for (v=0; v<edge.variant_ptrs.size(); v++)
{
int *v_ptr = edge.variant_ptrs[v];
const int num_aa = *v_ptr++;
float edge_score = 0;
// this portion needs to be trained for each charge/size/region
if (rem->has_values)
{
mass_t exp_edge_mass=0;
int j;
for (j=0; j<num_aa; j++)
exp_edge_mass+=aa2mass[v_ptr[j]];
const int offset_bin = rem->calc_tol_bin_idx(n_break,c_break,exp_edge_mass);
if (num_aa == 1)
{
const float add_score = weight_single_state_score * rem->log_odds_state[intersec_state] +
weight_single_offset_score * rem->log_odds_offset[intersec_state][offset_bin];
if (! add_only_positive_state_scores || add_score>0)
edge_score += add_score;
}
else
{
edge_score += multi_aa_penalty;
if (num_aa>2)
edge_score += (num_aa-2)*0.5*multi_aa_penalty;
const float add_score = weight_multi_state_score * rem->multi_log_odds_state[intersec_state] +
weight_multi_offset_score * rem->multi_log_odds_offset[intersec_state][offset_bin];
if (! add_only_positive_state_scores || add_score>0)
edge_score += add_score;
}
edge_score += weight_transfer_score * rem->log_odds_transfer[n_state][c_state];
}
// this is in the EdgeModel, common to all charge/size/regions
if (num_aa>1)
{
const int n_aa = v_ptr[0];
const int c_aa = v_ptr[num_aa-1];
const int org_n_aa = org_aa[n_aa];
const int org_c_aa = org_aa[c_aa];
edge_score += weight_combo_score * double_combo_scores[org_n_aa][org_c_aa];
if (! allowed_double_edge[org_n_aa][org_c_aa])
edge_score += bad_pair_penalty;
if (double_edge_with_same_mass_as_single[org_n_aa][org_c_aa])
edge_score += problematic_pair_penalty;
}
edge.variant_scores[v] += edge_score;
if (edge.variant_scores[v]>edge.max_variant_score)
edge.max_variant_score = edge.variant_scores[v];
}
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -