// peakrank_combined.cpp
#include "PeakRankModel.h"
#include "auxfun.h"
extern const int num_RKH_combos;
extern const int num_RKH_pairs;
extern const string combos[];
extern const float RKH_pair_matrix[6][6];
/************************************************************************
A note about the different structure of functions/classes when the
combined features are used. Since each partition has different fragments,
which induce different fragment names, most of the model informaiton was
pushed into the partition level, rather than the PeakRankModel level, with
the other models that looked at each fragment independently.
*************************************************************************/
int PartitionModel::read_combined_partition_model(const string& path, Config *config,
int _charge, int _size_idx, int _mobility)
{
const int max_global_frag_idx = config->get_all_fragments().size();
charge = _charge;
size_idx = _size_idx;
mobility = _mobility;
max_frag_idx = -1;
fragment_type_idxs.clear();
ostringstream oss;
oss << path << "_" << charge << "_" << this->size_idx << "_" << mobility << "_model.txt";
ifstream boost_file(oss.str().c_str());
if (! boost_file.is_open() ||! boost_file.good())
{
cout << "Warnind: Couldn\'t find " << oss.str() << endl;
return 0;
}
// read feature type and frags
char buff[256];
boost_file.getline(buff,256);
this->feature_set_type = NEG_INF;
int num_frags = NEG_INF;
istringstream iss(buff);
iss >> feature_set_type >> num_frags;
if (this->feature_set_type <=2)
{
cout << "Error: read_combined_partition_model works only on frag_set 3 and up!" << endl;
exit(1);
}
int f;
for (f=0; f<num_frags; f++)
{
int frag=NEG_INF;
iss >> frag;
if (frag<0)
{
cout << "Error: bad frags in combined model!" << endl;
exit(1);
}
fragment_type_idxs.push_back(frag);
if (frag>max_frag_idx)
max_frag_idx = frag;
}
combined_frag_boost_model.read_rankboost_model(boost_file);
boost_file.close();
num_features_per_frag= (combined_frag_boost_model.get_num_real_features()-1) /
fragment_type_idxs.size();
return fragment_type_idxs.size();
}
int PartitionModel::write_combined_partition_model(const string& path)
{
if (this->feature_set_type <=2)
{
cout << "Error: read_combined_partition_model works only on frag_set 3 and up!" << endl;
exit(1);
}
ostringstream oss;
fstream boost_file(path.c_str(),ios::out);
if (! boost_file.is_open() || ! boost_file.good())
{
cout << "Error: couldn't open file for wiriting!" << endl;
exit(1);
}
boost_file << feature_set_type << " " << fragment_type_idxs.size();
int f;
for (f=0; f<fragment_type_idxs.size(); f++)
boost_file << " " << fragment_type_idxs[f];
boost_file << endl;
combined_frag_boost_model.write_rankboost_model(boost_file,true);
return 1;
}
/*****************************************************************************
Creates the phi domain for the rank boost training.
phi contains pairs of idx0,idx1 and weight.
idx0 and idx1 are peak sample indexes while weight is the relative intensity
between them. The weight is normalized according to the maximal intensity of a
peak from the same frag type in the peptide (e.g., the strongest y is given
intensity 1 while a missing y is given intensity 0. The weight of x,y is set to
intensity[y]-intensity[x] (it is assumed that for all pairs, y is stronger than x).
******************************************************************************/
void create_phi_list_from_combined_peak_samples(const vector<float>& peak_intens,
const vector<int> & peak_frag_types,
const vector<PeakStart>& peak_starts,
const int max_frag_idx,
vector<SamplePairWeight>& phi)
{
const weight_t min_norm_weight = 0.2;
int i;
phi.clear();
for (i=0; i<peak_starts.size(); i++)
{
const int start_idx = peak_starts[i].peak_start_idx;
const int num_peaks = peak_starts[i].num_peaks;
vector<float> max_intens_per_type;
max_intens_per_type.resize(max_frag_idx+1,0); //
vector<float> intens;
intens.resize(num_peaks,-1);
float max_inten = -1;
int j;
for (j=0; j<num_peaks; j++)
{
const int peak_idx = start_idx+j;
const float peak_inten = peak_intens[peak_idx];
const int peak_frag_type = peak_frag_types[peak_idx];
intens[j] = peak_inten;
if (peak_inten>max_inten)
max_inten=peak_inten;
if (peak_inten>max_intens_per_type[peak_frag_type])
max_intens_per_type[peak_frag_type]=peak_inten;
}
if (max_inten<=0)
{
continue;
}
const float one_over_max = 1.0/max_inten;
for (j=0; j<num_peaks; j++)
intens[j] *= one_over_max;
for (j=0; j<=max_frag_idx; j++)
max_intens_per_type[j]*= one_over_max;
// look at all pairs, only consider pairs for which inten[idx1]>inten[idx0]
for (j=0; j<num_peaks; j++)
{
int k;
for (k=0; k<num_peaks; k++)
if (intens[k]>intens[j])
{
// ignore a pair of peaks if the difference between them is less than
// 0.15 of the maximum intensity (there might be too many misordered
// pairs in that range
if ( (intens[j]/intens[k]) > 0.7)
continue;
weight_t pair_weight = 0.4 + 0.6*intens[k] - 0.4*(intens[j]/intens[k]);
if (pair_weight<min_norm_weight)
continue;
// don't let very weak peaks participate in a missing peak pair
if (intens[j]==0 && intens[k]<0.15)
continue;
phi.push_back(SamplePairWeight(start_idx+j,start_idx+k,pair_weight));
}
}
}
cout << "Created " << phi.size() << " pairs for " << peak_starts.size() << " peptide samples" << endl;
}
void PartitionModel::train_combined_partition_model(
const PeakRankModel *prank,
char *sample_file_path,
int _charge,
int _size_idx,
int _mobility,
int num_frags_to_choose,
char *report_dir,
int max_num_rounds,
char *test_set_file,
int test_peptide_length,
char *stop_signal_file,
weight_t max_weight_ratio)
{
const float min_ratio_detected = 0.15;
Config *config = prank->get_config();
const vector<FragmentType>& all_fragments = config->get_all_fragments();
const vector<mass_t>& aa2mass = config->get_aa2mass();
const int num_all_frags = all_fragments.size();
if (max_num_rounds<0)
max_num_rounds = 1000;
charge = _charge;
size_idx = _size_idx;
mobility = _mobility;
vector<TrainingPeptide> sample_tps, test_tps;
vector<int> frag_counts;
vector<int> frag_detected;
vector<int> length_counts;
if (this->feature_set_type <=2)
{
cout << "Error: train_combined_partition_model works only on frag_set 3 and up!" << endl;
exit(1);
}
read_training_peptides_from_file(sample_file_path,sample_tps);
cout << "Read " << sample_tps.size() << " training tps...";
int num_tps_to_add = 0; // for de novo models
if (prank->get_feature_set_type() == 4)
{
if (sample_tps.size()<15000)
num_tps_to_add = 1;
if (sample_tps.size()<10000)
num_tps_to_add = 2;
if (sample_tps.size()<5000)
num_tps_to_add = 3;
if (charge>=3 || size_idx>=3)
num_tps_to_add /= 2;
cout << "Adding at most " << num_tps_to_add << " per tp." << endl;
}
if (prank->get_feature_set_type() == 4)
{
cout << "Converting train: " << sample_tps.size() << " --> ";
convert_tps_to_partial_denovo(config,sample_tps,num_tps_to_add,false);
cout << sample_tps.size() << endl;
}
if (test_set_file)
{
read_training_peptides_from_file(test_set_file, test_tps);
cout << "Read " << test_tps.size() << " test tps...";
if (prank->get_feature_set_type() == 4)
{
cout << "Converting train: " << test_tps.size() << " --> ";
convert_tps_to_partial_denovo(config,test_tps,num_tps_to_add);
cout << test_tps.size() << endl;
}
}
// Create intial report on dataset
frag_counts.resize(num_all_frags,0);
frag_detected.resize(num_all_frags,0);
length_counts.resize(200,0);
int numH=0,numK=0, numR=0;
int i,f;
for (i=0; i<sample_tps.size(); i++)
{
const TrainingPeptide& tp = sample_tps[i];
int f;
for (f=0; f<tp.intens.size(); f++)
{
int j;
for (j=1; j<tp.intens[f].size(); j++)
{
if (tp.intens[f][j]>=0)
frag_counts[tp.frag_idxs[f]]++;
if (tp.intens[f][j]>0)
frag_detected[tp.frag_idxs[f]]++;
}
}
int j;
for (j=0; j<tp.amino_acids.size(); j++)
{
if (tp.amino_acids[j]==His)
numH++;
if (tp.amino_acids[j]==Lys)
numK++;
if (tp.amino_acids[j]==Arg)
numR++;
}
length_counts[tp.amino_acids.size()]++;
}
// report and select fragments for training
cout << "# training peptides: " << sample_tps.size() << endl;
cout << "Avg #R: " << numR/(double)sample_tps.size() << endl;
cout << "Avg #K: " << numK/(double)sample_tps.size() << endl;
cout << "Avg #H: " << numH/(double)sample_tps.size() << endl;
cout << endl << "Sample lengths:" << endl;
for (i=0; i<length_counts.size(); i++)
if (length_counts[i]>0)
cout << i << "\t" << length_counts[i] << endl;
cout << endl;
vector<float> frag_ratios;
frag_ratios.resize(all_fragments.size(),0);
cout << "Detected fragments: " << endl;
for (i=0; i<all_fragments.size(); i++)
{
const float ratio = (frag_counts[i]>0 ? (float)frag_detected[i]/frag_counts[i] : 0);
cout << all_fragments[i].label << "\t" << frag_detected[i] << " / " <<
frag_counts[i] << "\t = " << setprecision(3) << ratio << endl;
if (ratio>=min_ratio_detected)
frag_ratios[i]=ratio;
}
cout << endl;
// choose the most abundant fragments
cout << "Selected: ";
this->fragment_type_idxs.clear();
int max_frag_idx=-1;
for (f=0; f<num_frags_to_choose; f++)
{
float max_ratio=0;
int max_idx=-1;
int j;
for (j=0; j<frag_ratios.size(); j++)
if (frag_ratios[j]>max_ratio)
{
max_ratio=frag_ratios[j];
max_idx=j;
}
if (max_idx<0)
break;
fragment_type_idxs.push_back(max_idx);
frag_ratios[max_idx]=-1;
cout << " " << all_fragments[max_idx].label;
if (max_idx>max_frag_idx)
max_frag_idx=max_idx;
}
cout << endl;
if (fragment_type_idxs.size() == 0)
{
cout << "No fragments to train!" << endl;
return;
}
if (prank->get_feature_set_type() == 3)
{
set_combined_feature_names_in_rankboost_model(prank);
}
else if (prank->get_feature_set_type() == 4)
{
set_combined_dnv_feature_names_in_rankboost_model(prank);
}
else if (prank->get_feature_set_type() == 5)
{
// NOTE(review): the remainder of this file -- the body of the
// feature_set_type == 5 branch and everything after it in
// train_combined_partition_model -- was lost during extraction and
// replaced by website UI text. Recover it from the original source;
// the function above is syntactically incomplete as-is.