📄 peakrankmodel.cpp
字号:
cout << "Error: session AAs don't include: " << org_aas[i] << endl;
exit(1);
}
model_aas[i]=session_aas_to_model_aas[org_aas[i]];
if (model_aas[i]<0)
{
cout << "Error: couldn't find a rank model conversion for amino acid " << org_aas[i] << endl;
exit(1);
}
}
}
void PeakRankModel::set_binary_feature_names()
{
const int num_aas = this->get_num_model_aas();
const string dis_labels[]={"-3","-2","-1","+1","+2","+3"};
const int num_dis_labels = 6;
int i;
// cout << "Initialzing binary features for " << num_aas << " amino acids " << endl
// for (i=0; i<num_aas; i++)
// cout << i << "\t" << model_aa_labels[i] << endl;
this->binary_feature_names.clear();
for (i=0; i<num_dis_labels; i++)
{
const string dis_label = dis_labels[i];
int aa;
for (aa =0; aa<num_aas; aa++)
{
string fname = "IND CUT HAS " + model_aa_labels[aa] + " AT " + dis_label;
// binary_feature_names.push_back(fname);
}
}
/* for (i=0; i<num_aas; i++)
{
int j;
for (j=0; j<num_aas; j++)
{
string fname = "IND CUT IS BETWEEN " + model_aa_labels[i] + " - " + model_aa_labels[j];
binary_feature_names.push_back(fname);
}
} */
binary_feature_names.push_back("IND CUT AT N+1");
binary_feature_names.push_back("IND CUT AT C-1");
/* binary_feature_names.push_back("IND CUT AT N+1");
binary_feature_names.push_back("IND CUT AT N+2");
binary_feature_names.push_back("IND CUT AT N+3");
binary_feature_names.push_back("IND CUT AT N+4");
binary_feature_names.push_back("IND CUT AT N+5");
binary_feature_names.push_back("IND CUT AT C-1");
binary_feature_names.push_back("IND CUT AT C-2");
binary_feature_names.push_back("IND CUT AT C-3");
binary_feature_names.push_back("IND CUT AT C-4");
binary_feature_names.push_back("IND CUT AT C-5");*/
for (i=0; i<num_aas; i++)
{
string fname = "IND CUT AT +1, N AA IS " + model_aa_labels[i];
binary_feature_names.push_back(fname);
}
for (i=0; i<num_aas; i++)
{
string fname = "IND CUT AT -1, C AA IS " + model_aa_labels[i];
binary_feature_names.push_back(fname);
}
/* binary_feature_names.push_back("IND CUT LESS THAN MID PEPTIDE");
binary_feature_names.push_back("IND CUT MORE THAN MID PEPTIDE");
binary_feature_names.push_back("IND CUT OVER CHARGE LESS THAN MID OBS");
binary_feature_names.push_back("IND CUT OVER CHARGE MORE THAN MID OBS");*/
}
void PeakRankModel::set_real_feature_names()
{
const int num_aas = this->get_num_model_aas();
int i;
real_feature_names.clear();
const string prefixes[]={"BEFORE MID, N SIDE RKH ","AFTER MID, N SIDE RKH ",
"BEFORE MID, C SIDE RKH ","AFTER MID, C SIDE RKH "};
int c;
for (c=0 ; c< num_RKH_combos; c++)
real_feature_names.push_back("DIS MIN, N SIDE RHK " + combos[c]);
for (c=0 ; c< num_RKH_combos; c++)
real_feature_names.push_back("DIS MIN, C SIDE RHK " + combos[c]);
for (c=0 ; c< num_RKH_combos; c++)
real_feature_names.push_back("DIS MAX, N SIDE RHK " + combos[c]);
for (c=0 ; c< num_RKH_combos; c++)
real_feature_names.push_back("DIS MAX, C SIDE RHK " + combos[c]);
int p;
for (p=0; p<4; p++)
{
int c;
for (c=0 ; c< num_RKH_combos; c++)
{
int aa;
for (aa=0; aa<num_aas; aa++)
{
string fname = prefixes[p] + combos[c] + ", AA N of cut " + this->model_aa_labels[aa];
real_feature_names.push_back(fname);
}
}
for (c=0 ; c< num_RKH_combos; c++)
{
int aa;
for (aa=0; aa<num_aas; aa++)
{
string fname = prefixes[p] + combos[c] + ", AA C of cut " + this->model_aa_labels[aa];
real_feature_names.push_back(fname);
}
}
}
/* real_feature_names.push_back("CUT MASS PROPORTION (LESS THAN MID)");
real_feature_names.push_back("CUT MASS PROPORTION (MORE THAN MID)");
real_feature_names.push_back("CUT MASS OVER CHARGE PROPORTION (LESS THAN MID OBS)");
real_feature_names.push_back("CUT MASS OVER CHARGE ABS DIFF (LESS THAN MID OBS)");
real_feature_names.push_back("CUT MASS OVER CHARGE PROPORTION (MORE THAN MID OBS)");
real_feature_names.push_back("CUT MASS OVER CHARGE ABS DIFF (MORE THAN MID OBS)"); */
for (i=0; i<num_aas; i++)
real_feature_names.push_back("# N SIDE " + model_aa_labels[i]);
real_feature_names.push_back("# N SIDE BASIC AMINO ACIDS");
for (i=0; i<num_aas; i++)
real_feature_names.push_back("# C SIDE " + model_aa_labels[i]);
real_feature_names.push_back("# C SIDE BASIC AMINO ACIDS");
}
struct basicity_pair {
bool operator< (const basicity_pair& other) const
{
return basicity_score<other.basicity_score;
}
float basicity_score;
int idx;
};
/****************************************************************************
// reoprts sizes of partitioned training sets
// partitioned according to charge / size_idx / mobility
// if path is given, creates the training files
*****************************************************************************/
void PeakRankModel::partition_training_samples(
const vector< vector<TrainingPeptide> >& all_tps,
char *file_path_prefix,
char *test_path_prefix,
int minimal_size,
float prop_ts) const
{
if (file_path_prefix)
cout << "TRAIN: " << file_path_prefix << endl;
if (test_path_prefix)
cout << "TEST : " << test_path_prefix << endl;
cout << endl << endl;
cout << "Training partiotioning: " << endl;
cout << "----------------------- " << endl << endl;
int charge;
for (charge=1; charge<all_tps.size(); charge++)
{
if (all_tps[charge].size()==0)
continue;
cout << endl << "**************************************" << endl;
cout << endl << "CHARGE " << charge << " total " << all_tps[charge].size() << " peptides..." << endl;
const int num_size_idxs = this->size_thresholds[charge].size()+1;
vector< vector<int> > pep_counts;
vector< vector< vector<int> > > pep_idxs;
pep_counts.resize(num_size_idxs);
pep_idxs.resize(num_size_idxs);
int size_idx;
for (size_idx=0; size_idx<num_size_idxs; size_idx++)
{
pep_counts[size_idx].resize(4,0);
pep_idxs[size_idx].resize(4);
}
int i;
for (i=0; i<all_tps[charge].size(); i++)
{
const TrainingPeptide& tp = all_tps[charge][i];
const int size_idx = this->get_size_group(charge,tp.pm_with_19);
pep_counts[size_idx][0]++;
pep_counts[size_idx][tp.mobility]++;
pep_idxs[size_idx][tp.mobility].push_back(i);
}
for (size_idx=0; size_idx<num_size_idxs; size_idx++)
{
cout << size_idx << "\t";
if (pep_counts[size_idx][0]>0)
{
cout << pep_counts[size_idx][0] << "\t" << fixed << setprecision(4) <<
pep_counts[size_idx][0] / (double)all_tps[charge].size() << "\t";
}
else
cout << " -\t -\t";
int m;
for (m=MOBILE; m<=NONMOBILE; m++)
if (pep_counts[size_idx][m]>0)
{
cout << pep_counts[size_idx][m];
if (pep_counts[size_idx][m]<minimal_size)
cout << " *";
cout << "\t" << pep_counts[size_idx][m]/(double)pep_counts[size_idx][0] << "\t";
}
else
cout << " -\t 0\t";
cout << endl;
}
cout << endl;
if (! file_path_prefix)
continue;
for (size_idx=0; size_idx<num_size_idxs; size_idx++)
{
int m;
for (m=MOBILE; m<=NONMOBILE; m++)
{
vector<int> idxs_to_write = pep_idxs[size_idx][m];
cout << endl;
cout << "set " << charge << " " << size_idx << " " << m << endl;
// try adding samples from adjacent size_idxs
if (0 && idxs_to_write.size()<minimal_size && size_idx>0)
{
const vector<int>& prev_size_idxs = pep_idxs[size_idx-1][m];
int i;
// sort samples according to size
vector<basicity_pair> pairs;
pairs.clear();
for (i=0; i<prev_size_idxs.size(); i++)
{
basicity_pair bp;
bp.idx = prev_size_idxs[i];
bp.basicity_score = all_tps[charge][prev_size_idxs[i]].pm_with_19;
pairs.push_back(bp);
}
sort(pairs.begin(),pairs.end());
for (i=pairs.size()-1; i>=0; i--)
{
if (idxs_to_write.size() == minimal_size)
break;
idxs_to_write.push_back(pairs[i].idx);
// cout << setprecision(2) << all_tps[charge][pairs[i].idx].pm_with_19 << " " <<
// all_tps[charge][pairs[i].idx].mobility << " ";
}
cout << endl;
cout << " .. took " << pairs.size()-i << " samples from size " << size_idx-1 << endl;
}
// try adding samples from adjacent size_idxs
if (0 && idxs_to_write.size()<minimal_size && size_idx<num_size_idxs-1)
{
const vector<int>& next_size_idxs = pep_idxs[size_idx+1][m];
int i;
// sort samples according to size
vector<basicity_pair> pairs;
pairs.clear();
for (i=0; i<next_size_idxs.size(); i++)
{
basicity_pair bp;
bp.idx = next_size_idxs[i];
bp.basicity_score = all_tps[charge][next_size_idxs[i]].pm_with_19;
pairs.push_back(bp);
}
sort(pairs.begin(),pairs.end());
for (i=0; i<pairs.size(); i++)
{
if (idxs_to_write.size() == minimal_size)
break;
idxs_to_write.push_back(pairs[i].idx);
// cout << setprecision(2) << all_tps[charge][pairs[i].idx].pm_with_19 << " " <<
// all_tps[charge][pairs[i].idx].mobility << " ";
}
cout << endl;
cout << " .. took " << i << " samples from size " << size_idx+1 << endl;
}
// try adding samples from another mobility state
// choose most appropriate samepls (in terms of basicity)
if (idxs_to_write.size()<minimal_size)
{
int mobility_to_take_from=0;
if (m==MOBILE || m == NONMOBILE)
{
mobility_to_take_from=PARTIALLYMOBILE;
}
else if (pep_idxs[size_idx][MOBILE].size()>
pep_idxs[size_idx][NONMOBILE].size())
{
mobility_to_take_from = MOBILE;
}
else
mobility_to_take_from = NONMOBILE;
if (idxs_to_write.size() + pep_counts[size_idx][mobility_to_take_from] < minimal_size)
{
cout << "Error: insufficient number of training samples for charge " <<
charge << " size idx " << size_idx << endl;
continue;
}
vector<basicity_pair> pairs;
int i;
pairs.clear();
for (i=0; i<pep_idxs[size_idx][mobility_to_take_from].size(); i++)
{
basicity_pair bp;
bp.idx = pep_idxs[size_idx][mobility_to_take_from][i];
bp.basicity_score = all_tps[charge][bp.idx].get_basicity_score();
pairs.push_back(bp);
}
sort(pairs.begin(),pairs.end());
if (mobility_to_take_from>m)
{
// take least basic peptides
int i;
for (i=0; i<pairs.size() && idxs_to_write.size()<minimal_size; i++)
idxs_to_write.push_back(pairs[i].idx);
cout << " .. took " << i << " samples mobility " << mobility_to_take_from << endl;
}
else
{
// take most basic peptides
int i;
for (i=pairs.size()-1; i>=0 && idxs_to_write.size()<minimal_size; i--)
{
idxs_to_write.push_back(pairs[i].idx);
// cout << pairs[i].basicity_score << " ";
}
cout << endl;
cout << " .. took " << pairs.size()-i << " samples mobility " << mobility_to_take_from << endl;
}
}
sort(idxs_to_write.begin(),idxs_to_write.end());
if (idxs_to_write.size()<minimal_size)
{
cout << "Error: could not add enough samples to create set for charge " << charge << " size " << size_idx << " mobility " << m << endl;
exit(1);
}
if (! test_path_prefix)
{
char fname[256];
sprintf(fname,"%s_%d_%d_%d.txt",file_path_prefix,charge,size_idx,m);
ofstream ofs(fname);
ofs << idxs_to_write.size() << endl;
for (i=0; i<idxs_to_write.size(); i++)
all_tps[charge][idxs_to_write[i]].write_to_stream(ofs);
ofs.close();
cout << "Wrote " << idxs_to_write.size() << " to " << fname << endl;
}
else
{
char fname[256], ts_name[256];
const int num_test = int(prop_ts * (float)idxs_to_write.size());
vector<int> ts_idxs;
vector<bool> ts_ind;
choose_k_from_n(num_test,idxs_to_write.size(),ts_idxs);
ts_ind.resize(idxs_to_write.size(),false);
int i;
for (i=0; i<ts_idxs.size(); i++)
ts_ind[ts_idxs[i]]=true;
sprintf(fname,"%s_%d_%d_%d.txt",file_path_prefix,charge,size_idx,m);
ofstream ofs(fname);
ofs << idxs_to_write.size() - num_test<< endl;
for (i=0; i<idxs_to_write.size(); i++)
if (! ts_ind[i])
all_tps[charge][idxs_to_write[i]].write_to_stream(ofs);
ofs.close();
cout << "Wrote " << idxs_to_write.size() - num_test << " to " << fname << endl;
sprintf(ts_name,"%s_%d_%d_%d.txt",test_path_prefix,charge,size_idx,m);
ofstream ofs_test(ts_name);
ofs_test << num_test<< endl;
for (i=0; i<idxs_to_write.size(); i++)
if (ts_ind[i])
all_tps[charge][idxs_to_write[i]].write_to_stream(ofs_test);
ofs_test.close();
cout << "Wrote " << num_test << " to " << ts_name << endl;
}
// find mix/max sizes and min/max basicity scores
mass_t min_pm_with_19 = 1000000.0;
mass_t max_pm_with_19 = 0.0;
float min_basicity_score = 10000.0;
float max_basicity_score = -1.0;
for (i=0; i<idxs_to_write.size(); i++)
{
const TrainingPeptide& tp = all_tps[charge][idxs_to_write[i]];
const float bs = tp.get_basicity_score();
if (tp.pm_with_19>max_pm_with_19)
max_pm_with_19 = tp.pm_with_19;
if (tp.pm_with_19<min_pm_with_19)
min_pm_with_19 = tp.pm_with_19;
if (bs<min_basicity_score)
min_basicity_score = bs;
if (bs>max_basicity_score)
max_basicity_score = bs;
}
cout << "pm ranges: " << setprecision(2) << min_pm_with_19 << " - " << max_pm_with_19 << endl;
cout << "basicity : " << min_basicity_score << " - " << max_basicity_score << endl;
cout << "-------------------------------------" << endl;
}
}
}
}
/***************************************************************************
Reads the sample tps into the dataset and creats sample for a given
fragment type idx
****************************************************************************/
void PeakRankModel::read_training_peptides_into_rank_boost_dataset(
int frag_type_idx,
int spec_charge,
const vector<TrainingPeptide>& sample_tps,
RankBoostDataset& rank_ds,
vector<float>& peak_intens,
vector<PeakStart>& peak_starts,
vector<float>& max_annotated_intens) const
{
const vector<mass_t>& aa2mass = config->get_aa2mass();
rank_ds.clear();
peak_intens.clear();
peak_starts.clear();
max_annotated_intens.clear();
int i;
for (i=0; i<sample_tps.size(); i++)
{
const TrainingPeptide& tp = sample_tps[i];
const int frag_pos = tp.get_frag_idx_pos(frag_type_idx);
if (frag_pos<0)
continue;
PeakStart ps;
ps.peptide_sample_idx = i;
ps.peak_start_idx=rank_ds.get_num_samples();
ps.num_peaks=0;
// scan all cuts for max inten
float max_ann_inten=0;
int f;
for (f=0; f<tp.intens.size(); f++)
{
int c;
for (c=1; c<tp.intens[f].size()-1; c++)
if (tp.intens[f][c]>max_ann_inten)
max_ann_inten=tp.intens[f][c];
}
if (max_ann_inten<=0)
continue;
const vector<float>& intens = tp.intens[frag_pos];
vector<int> cut_ranks;
// gives integer values to cut idxs according to intensity
calc_cut_ranks(intens,cut_ranks);
mass_t c_term_mass = tp.n_mass;
int cut_idx;
for (cut_idx=1; cut_idx<intens.size(); cut_idx++)
c_term_mass+=aa2mass[tp.amino_acids[cut_idx-1]];
mass_t cut_mass=tp.n_mass;
for (cut_idx=1; cut_idx<intens.size(); cut_idx++)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -