📄 pmc_rank.cpp
字号:
#include "PMCSQS.h"
#include "auxfun.h"
bool PMCSQS_Scorer::read_pmc_rank_models(Config *_config, char *file_name)
{
config = _config;
string path = config->get_resource_dir() + "/" + file_name;
ifstream in_stream(path.c_str(),ios::in);
if (! in_stream.good())
{
cout << "Warning: couldn't open pmc rank model for reading: " << path << endl;
return false;
}
char buff[512];
int num_charges=-1;
in_stream.getline(buff,256);
istringstream iss1(buff);
frag_pair_sum_offset=NEG_INF;
bin_increment=NEG_INF;
iss1 >> bin_increment >> this->frag_pair_sum_offset;
if (frag_pair_sum_offset==NEG_INF || bin_increment == NEG_INF)
{
cout << "Error in pmc model file!" << endl;
exit(1);
}
in_stream.getline(buff,256);
istringstream iss(buff);
iss >> num_charges;
max_model_charge=num_charges-1;
pmc_rank_models.resize(num_charges);
pmc_rank_mass_thresholds.resize(num_charges);
pmc_charge_mz_biases.resize(num_charges);
int i;
for (i=0; i<num_charges; i++)
{
in_stream.getline(buff,256);
istringstream iss(buff);
int num_threshes=0;
iss >> num_threshes;
pmc_rank_mass_thresholds[i].resize(num_threshes,NEG_INF);
int j;
for (j=0; j<num_threshes; j++)
iss >> pmc_rank_mass_thresholds[i][j];
}
for (i=0; i<num_charges; i++)
{
in_stream.getline(buff,256);
istringstream iss(buff);
int num_biases=0;
iss >> num_biases;
pmc_charge_mz_biases[i].resize(num_biases,NEG_INF);
int j;
for (j=0; j<num_biases; j++)
iss >> pmc_charge_mz_biases[i][j];
}
// read Boost models
for (i=0; i<num_charges; i++)
{
in_stream.getline(buff,256);
istringstream iss(buff);
int num_models=-1;
iss >> num_models;
if (num_models<0)
{
cout << "Error: bad parsing of PMCR model file!" << endl;
exit(0);
}
pmc_rank_models[i].resize(num_models,NULL);
int j;
for (j=0; j<num_models; j++)
{
pmc_rank_models[i][j]=new RankBoostModel;
pmc_rank_models[i][j]->read_rankboost_model(in_stream);
}
}
in_stream.close();
// this->write_pmc_rank_models("XXX.txt");
this->ind_initialized_pmcr = true;
return true;
}
void PMCSQS_Scorer::write_pmc_rank_models(const char *path) const
{
ofstream out_stream(path,ios::out);
if (! out_stream.good())
{
cout << "Error: couldn't open pmc model for writing: " << path << endl;
exit(1);
}
out_stream << this->bin_increment << " " << this->frag_pair_sum_offset << endl;
out_stream << this->pmc_rank_mass_thresholds.size() << endl;
out_stream << setprecision(3) << fixed;
int i;
for (i=0; i<this->pmc_rank_mass_thresholds.size(); i++)
{
out_stream << pmc_rank_mass_thresholds[i].size();
int j;
for (j=0; j<pmc_rank_mass_thresholds[i].size(); j++)
out_stream << " " << pmc_rank_mass_thresholds[i][j];
out_stream << endl;
}
for (i=0; i<this->pmc_charge_mz_biases.size(); i++)
{
out_stream << pmc_charge_mz_biases[i].size();
int j;
for (j=0; j<pmc_charge_mz_biases[i].size(); j++)
out_stream << " " << pmc_charge_mz_biases[i][j];
out_stream << endl;
}
// write RankBoost models
for (i=0; i<pmc_rank_models.size(); i++)
{
int j;
if (pmc_rank_models[i].size()==1 && ! pmc_rank_models[i][0])
{
out_stream << 0 << endl;
continue;
}
out_stream << pmc_rank_models[i].size() << endl;
for (j=0; j<pmc_rank_models[i].size(); j++)
{
if (pmc_rank_models[i][j])
{
pmc_rank_models[i][j]->write_rankboost_model(out_stream,true);
}
else
{
cout << "Error: non intialized rank pmc model!" << endl;
exit(1);
}
}
}
out_stream.close();
}
void PMCSQS_Scorer::set_pmc_mass_thresholds(int option)
{
if (option==0)
{
pmc_rank_mass_thresholds = config->get_size_thresholds();
int i;
for (i=0; i<pmc_rank_mass_thresholds.size(); i++)
{
if (pmc_rank_mass_thresholds[i].size()>0)
{
if (pmc_rank_mass_thresholds[i][pmc_rank_mass_thresholds[i].size()-1]>10000)
pmc_rank_mass_thresholds[i].pop_back();
}
}
}
if (option==1)
{
pmc_rank_mass_thresholds.resize(4);
pmc_rank_mass_thresholds[1].push_back(1150.0);
pmc_rank_mass_thresholds[1].push_back(1400.0);
pmc_rank_mass_thresholds[2].push_back(1100.0);
pmc_rank_mass_thresholds[2].push_back(1300.0);
pmc_rank_mass_thresholds[2].push_back(1600.0);
pmc_rank_mass_thresholds[2].push_back(1900.0);
pmc_rank_mass_thresholds[2].push_back(2400.0);
pmc_rank_mass_thresholds[3].push_back(1950.0);
pmc_rank_mass_thresholds[3].push_back(2450.0);
pmc_rank_mass_thresholds[3].push_back(3000.0);
}
if (option==2)
{
pmc_rank_mass_thresholds.resize(4);
pmc_rank_mass_thresholds[2].push_back(1300.0);
pmc_rank_mass_thresholds[2].push_back(1900.0);
}
}
void convert_ME_to_RankBoostSample(const ME_Regression_Sample& me,
RankBoostSample& rbs)
{
rbs.clear();
int i;
for (i=0; i<me.f_vals.size(); i++)
rbs.add_real_feature(me.f_vals[i].f_idx,me.f_vals[i].val);
}
/******************************************************************************
Train PMC models from positive example files
*******************************************************************************/
void PMCSQS_Scorer::train_pmc_rank_models(Config *config, const FileManager& fm,
int sel_charge, bool overwrite)
{
const bool sample_diagnostic = false;
const vector<int>& spectra_counts = fm.get_spectra_counts();
max_model_charge=0;
int charge;
for (charge=1; charge<spectra_counts.size(); charge++)
{
if (spectra_counts[charge]>=MIN_SPECTRA_FOR_PMCSQS_MODEL)
max_model_charge=charge;
}
const int max_to_read_per_file = 40000;
vector<string> real_names;
init_PMC_feature_names(real_names);
// try and read existing pmc model, otherwise init a new one
string pmc_path = config->get_resource_dir() + "/" + config->get_model_name() + "_PMCR.txt";
ifstream model_stream(pmc_path.c_str());
if (model_stream.is_open() && model_stream.good())
{
model_stream.close();
string pmcr_name = config->get_model_name() + "_PMCR.txt";
const char *path = pmc_path.c_str();
this->read_pmc_rank_models(config,(char *)pmcr_name.c_str());
}
else
{
set_pmc_mass_thresholds();
this->set_frag_pair_sum_offset(MASS_PROTON); // b+y - PM+19
this->set_bin_increment(0.1);
pmc_rank_models.resize(pmc_rank_mass_thresholds.size());
pmc_charge_mz_biases.resize(pmc_rank_mass_thresholds.size());
}
const double prop_train = 0.5;
// It is assumed that the mass thresholds were set according to the training data
// (this is done manually with values encoded in the set_mass_threhsolds function)
for (charge=1; charge<=max_model_charge; charge++)
{
if (sel_charge>0 && charge != sel_charge)
continue;
const int num_sizes = pmc_rank_mass_thresholds[charge].size();
pmc_rank_models[charge].resize(num_sizes+1,NULL);
pmc_charge_mz_biases[charge].resize(num_sizes+1,0);
int size_idx;
for (size_idx=0; size_idx<=num_sizes; size_idx++)
{
if (pmc_rank_models[charge][size_idx] && ! overwrite)
continue;
vector<SingleSpectrumFile *> test_ssfs;
BasicSpecReader bsr;
static QCPeak peaks[5000];
RankBoostDataset train_ds, test_ds, pos_ds, neg_ds;
mass_t min_mass =0;
mass_t max_mass = POS_INF;
if (size_idx>0)
min_mass = pmc_rank_mass_thresholds[charge][size_idx-1];
if (size_idx<num_sizes)
max_mass = pmc_rank_mass_thresholds[charge][size_idx];
// these ranges are given according to pm_with_19
// so files should be selected through select_files and not
// select_file_in_mz_range
FileSet fs;
fs.select_files(fm,min_mass,max_mass,-1,-1,charge);
if (fs.get_total_spectra()<500)
continue;
int num_groups_in_train=0;
int num_groups_in_test=0;
cout << "TRAINING charge " << charge << " size " << size_idx << " (" <<
min_mass << "-" << max_mass << ")" << endl;
fs.randomly_reduce_ssfs(max_to_read_per_file);
const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
const int num_samples = all_ssf.size();
// first find the bias in number of bins between the true m/z bin and
// the optimal m/z bin
vector<bool> skipped_idxs;
skipped_idxs.resize(num_samples,false);
int skipped_bad_mz=0;
mass_t total_bias=0;
int i;
for (i=0; i<num_samples; i++)
{
SingleSpectrumFile* ssf = all_ssf[i];
BasicSpectrum bs;
bs.num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
bs.peaks = peaks;
bs.ssf = ssf;
ssf->peptide.calc_mass(config);
const mass_t true_mz = (ssf->peptide.get_mass()+MASS_H2O+(mass_t)charge)/(mass_t)charge;
if (fabs(true_mz - bs.ssf->m_over_z)>2.5)
{
//cout << setprecision(2) << true_mz << " <---> " << bs.ssf->m_over_z << " skipping" << endl;
skipped_bad_mz++;
skipped_idxs[i]=true;
continue;
}
init_for_current_spec(config,bs);
calculate_curr_spec_pmc_values(bs, bin_increment);
// find the true_mz_bin_idx
const vector<PMCRankStats>& pmc_stats = curr_spec_rank_pmc_tables[charge];
int true_mz_bin_idx=0;
while (true_mz_bin_idx<pmc_stats.size() && pmc_stats[true_mz_bin_idx].m_over_z<true_mz)
true_mz_bin_idx++;
if (true_mz_bin_idx == pmc_stats.size())
true_mz_bin_idx--;
if (true_mz_bin_idx>0 && pmc_stats[true_mz_bin_idx].m_over_z-true_mz>true_mz-pmc_stats[true_mz_bin_idx-1].m_over_z)
true_mz_bin_idx--;
int opt_bin_idx = get_optimal_bin(true_mz_bin_idx, charge);
if (opt_bin_idx <=0 || opt_bin_idx == pmc_stats.size()-1)
{
skipped_bad_mz++;
skipped_idxs[i]=true;
continue;
}
total_bias += (pmc_stats[opt_bin_idx].m_over_z - pmc_stats[true_mz_bin_idx].m_over_z);
if (fabs(pmc_stats[opt_bin_idx].m_over_z - pmc_stats[true_mz_bin_idx].m_over_z)>4.0)
{
cout << "opt bin: " << opt_bin_idx << " (" << pmc_stats[opt_bin_idx].m_over_z << ") ";
cout << "tru bin: " << true_mz_bin_idx << " ("<< pmc_stats[true_mz_bin_idx].m_over_z << ")" << endl;
}
}
mass_t mz_bias = total_bias / (mass_t)(num_samples-skipped_bad_mz);
pmc_charge_mz_biases[charge][size_idx]=mz_bias;
cout << "m/z bias: " << setprecision(4) << mz_bias << endl;
cout << "skipped " << skipped_bad_mz << "/" << num_samples <<
" because of m/z more than 2.5 away from observed..." << endl;
// pmc_charge_mz_biases[charge][size_idx] = 0;
for (i=0; i<num_samples; i++)
{
if (skipped_idxs[i])
continue;
SingleSpectrumFile* ssf = all_ssf[i];
BasicSpectrum bs;
bs.num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
bs.peaks = peaks;
bs.ssf = ssf;
const mass_t true_mz = (ssf->peptide.get_mass()+MASS_H2O+(mass_t)charge)/(mass_t)charge;
init_for_current_spec(config,bs);
calculate_curr_spec_pmc_values(bs, bin_increment);
// find the true_mz_bin_idx
const vector<PMCRankStats>& pmc_stats = curr_spec_rank_pmc_tables[charge];
int true_mz_bin_idx=0;
while (true_mz_bin_idx<pmc_stats.size() && pmc_stats[true_mz_bin_idx].m_over_z<true_mz)
true_mz_bin_idx++;
if (true_mz_bin_idx == pmc_stats.size())
true_mz_bin_idx--;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -