📄 denovoranktrain.cpp
字号:
char buff[2048];
ifs.getline(buff,2048);
if (ifs.gcount()>2046)
cout << "Warning: buffer size?" << endl;
if (ifs.gcount()<5)
continue;
istringstream iss(buff);
// >> 92 0 6_6260 EGSSLLGSDAGELAGAGK 1618.79
string dummy;
int dummy_file;
int scan=-1;
string title;
string correct_pep_str;
mass_t mass_with_19 = -1;
iss >> dummy >> dummy_file >> scan >> title >> correct_pep_str >> mass_with_19;
if (scan<0 || mass_with_19<0 || mass_with_19>10000)
{
cout << scan << "\t" << mass_with_19 << endl;
cout << "Error parsing line in file " << denovo_file << endl << "LINE:" <<buff << endl;
exit(1);
}
int num_seqs=0;
ifs.getline(buff,2048);
sscanf(buff,"%d",&num_seqs);
if (num_seqs<=0 || num_seqs>500)
{
cout << "Error parsing line in file " << denovo_file << endl << "LINE:" <<buff << endl;
exit(1);
}
bool has_correct = false;
bool correct_first = false;
PeptideSet set;
set.file_idx=file_idx;
set.scan = scan;
set.correct_sol.charge = file_charge;
set.correct_sol.type = SOL_CORRECT;
set.correct_sol.pep.parse_from_string(config,correct_pep_str);
set.correct_sol.pep.set_aa_before(-1);
set.correct_sol.pep.set_aa_after(-1);
set.correct_sol.num_correct_aas = set.correct_sol.pep.get_num_aas();
set.correct_sol.pm_with_19 = set.correct_sol.pep.get_mass_with_19();
set.correct_sol.reaches_n_terminal = true;
set.correct_sol.reaches_c_terminal = true;
vector<string> inserted_strings;
inserted_strings.clear();
int i;
for (i=0; i<num_seqs; i++)
{
ifs.getline(buff,2048);
mass_t start_mass=-1;
int num_correct_aas =-1;
int num_aas = -1;
string denovo_seq="";
istringstream iss(buff);
iss >> start_mass >> num_correct_aas >> num_aas >> denovo_seq;
if (num_correct_aas==0 && num_aas==0)
continue;
if (start_mass<0 || start_mass > 10000 || num_correct_aas<0 ||
num_aas<0 || num_aas>100 || num_correct_aas> num_aas ||
denovo_seq.length()<3)
{
cout << "Error parsing " << denovo_file << endl;
cout << "LINE: " << buff << endl;
exit(1);
}
// check if peptide is same as correct don't add it
if (denovo_seq == correct_pep_str || num_correct_aas == num_aas)
{
has_correct = true;
if (i==0)
correct_first = true;
continue;
}
int j;
for (j=0; j<inserted_strings.size(); j++)
if (inserted_strings[j] == denovo_seq)
break;
if (j<inserted_strings.size())
continue;
// check how similar the cuts are
inserted_strings.push_back(denovo_seq);
PeptideSolution sol;
sol.charge = file_charge;
sol.pep.parse_from_string(config,denovo_seq);
sol.pep.set_aa_before(-1);
sol.pep.set_aa_after(-1);
sol.num_correct_aas = num_correct_aas;
sol.type=SOL_INCORRECT_DENOVO;
sol.pm_with_19 = sol.pep.get_mass_with_19();
sol.reaches_n_terminal = true;
sol.reaches_c_terminal = true;
set.incorrect_sols.push_back(sol);
}
if (add_peptide_set_to_psm(psm,set))
total_num_sets++;
num_sets++;
if (has_correct)
num_with_correct++;
if (correct_first)
num_first_correct++;
}
ifs.close();
cout << "\tsets added: " << num_sets << endl;
cout << "\thad correct: " << float(num_with_correct)/num_sets << endl;
cout << "\tcorrect first:" << float(num_first_correct)/num_sets << endl;
cout << endl;
}
cout << "Processed " << num_read << "/" << names.size() << " denovo files" << endl;
}
/*******************************************************************************
Reads the sets of predicted peptides:
list of db hits amd of complete de novo sequences for the spectrum.
********************************************************************************/
void create_complete_denovo_set_map(
Config *config,
const string& mgf_list,
const string& db_dir,
const string& correct_dir,
const string& denovo_dir,
int charge,
int size_idx,
PeptideSetMap& psm,
vector<bool>& file_indicators)
{
vector<string> base_names,list;
read_paths_into_list(mgf_list.c_str(), list);
base_names.resize(list.size());
int i;
for (i=0; i<list.size(); i++)
get_file_name_without_extension(list[i],base_names[i]);
file_indicators.clear();
file_indicators.resize(list.size(),false);
// create map
psm.clear();
char suf_buf[64];
sprintf(suf_buf,"_dbh_%d_%d.txt",charge,size_idx);
string db_suffix = string(suf_buf);
add_db_hits_to_psm(config,base_names,db_dir,correct_dir,db_suffix,charge,psm);
sprintf(suf_buf,"_full_dnv_%d_%d.txt",charge,size_idx);
string dnv_full_suffix = string(suf_buf);
// add_denovo_paths_to_psm(config,base_names,denovo_dir,db_suffix,charge,psm);
add_denovo_dbh_to_psm(config,base_names,denovo_dir,correct_dir,db_suffix,charge,psm);
//
cout << "Read info for charge " << charge << " size " << size_idx << endl;
vector<int> type_sum;
type_sum.resize(10,0);
int num_sets=0;
int num_incorrect=0;
for (PeptideSetMap::const_iterator it = psm.begin(); it != psm.end(); it++)
{
num_sets++;
file_indicators[(*it).first.file_idx]=true;
num_incorrect += (*it).second.incorrect_sols.size();
int j;
for (j=0; j<(*it).second.incorrect_sols.size(); j++)
type_sum[(*it).second.incorrect_sols[j].type]++;
}
cout << "\t" << num_sets << "\tsets" << endl;
cout << "\t" << num_incorrect <<"\tnegtaive samples" << endl;
for (i=0; i<type_sum.size(); i++)
if (type_sum[i]>0)
cout << "\ttype " << i << " " << type_sum[i] << endl;
}
/*******************************************************************************
Reads the sets of predicted peptides:
list of db hits amd of complete de novo sequences for the spectrum.
********************************************************************************/
void create_complete_dbh_set_map(
Config *config,
const string& mgf_list,
const string& db_dir,
const string& correct_dir,
int charge,
int size_idx,
PeptideSetMap& psm,
vector<bool>& file_indicators)
{
vector<string> base_names,list;
read_paths_into_list(mgf_list.c_str(), list);
base_names.resize(list.size());
int i;
for (i=0; i<list.size(); i++)
get_file_name_without_extension(list[i],base_names[i]);
file_indicators.clear();
file_indicators.resize(list.size(),false);
// create map
psm.clear();
char suf_buf[64];
sprintf(suf_buf,"_dbh_%d_%d.txt",charge,size_idx);
string db_suffix = string(suf_buf);
add_db_hits_to_psm(config,base_names,db_dir,correct_dir,db_suffix,charge,psm);
//
cout << "Read info for charge " << charge << " size " << size_idx << endl;
vector<int> type_sum;
type_sum.resize(10,0);
int num_sets=0;
int num_incorrect=0;
for (PeptideSetMap::const_iterator it = psm.begin(); it != psm.end(); it++)
{
num_sets++;
file_indicators[(*it).first.file_idx]=true;
num_incorrect += (*it).second.incorrect_sols.size();
int j;
for (j=0; j<(*it).second.incorrect_sols.size(); j++)
type_sum[(*it).second.incorrect_sols[j].type]++;
}
cout << "\t" << num_sets << "\tsets" << endl;
cout << "\t" << num_incorrect <<"\tnegtaive samples" << endl;
for (i=0; i<type_sum.size(); i++)
if (type_sum[i]>0)
cout << "\ttype " << i << " " << type_sum[i] << endl;
}
// returns the total number of pairs in the psm
// that had weights assigned
// reassigns weights to sets according to weights of db / full de novo
int assign_denovo_weights_to_sets(PeptideSetMap& psm,
float ratio_db,
float ratio_full_denovo)
{
const float sum = ratio_db + ratio_full_denovo;
const float req_type1_weight = ratio_db/sum;
const float req_type2_weight = 1 - req_type1_weight;
int total_num_pairs =0;
for (PeptideSetMap::iterator it = psm.begin(); it != psm.end(); it++)
{
PeptideSet& set = (*it).second;
int num_type1=0;
int num_type2=0;
float total_type1_weight=0;
float total_type2_weight=0;
// set weight according to proportion of correct amino acids
int i;
for (i=0; i<set.incorrect_sols.size(); i++)
{
PeptideSolution& sol = set.incorrect_sols[i];
const float correct_aa_ratio = ((float)sol.num_correct_aas/(float)sol.pep.get_num_aas());
sol.weight = 1.0 - correct_aa_ratio;
if (sol.type==1)
{
num_type1++;
total_type1_weight+=sol.weight;
}
else if (sol.type==2)
{
num_type2++;
total_type2_weight+=sol.weight;
}
else
{
cout << "Error: unknown de novo sample type: " << sol.type << endl;
exit(1);
}
}
total_num_pairs+=set.incorrect_sols.size();
const float mul1= (num_type1>0 ? req_type1_weight/total_type1_weight : 0);
const float mul2= (num_type2>0 ? req_type2_weight/total_type2_weight : 0);
set.total_set_weight=0;
for (i=0; i<set.incorrect_sols.size(); i++)
{
PeptideSolution& sol = set.incorrect_sols[i];
sol.weight *= (sol.type == 1 ? mul1 : mul2);
set.total_set_weight += sol.weight;
}
}
return total_num_pairs;
}
// also removes pairs with weight 0
void re_weight_complete_denovo_phi_support(vector<SamplePairWeight>& phi_support,
float ratio_db, float ratio_denovo, float ratio_db_cross)
{
const int num_types = 4;
const double ratio_sum = (double)(ratio_db + ratio_denovo + ratio_db_cross);
const double per_weights[num_types] = {0, (double)ratio_db / ratio_sum, (double)ratio_denovo/ratio_sum,
(double)ratio_db_cross / ratio_sum};
double total_weights[num_types]={0};
int num_samples[num_types]={0};
int i;
for (i=0; i<phi_support.size(); i++)
{
const int type = phi_support[i].tag;
total_weights[type]+=phi_support[i].weight;
num_samples[type]++;
}
double total_weight=0;
for (i=0; i<num_types; i++)
total_weight+=total_weights[i];
double mult_vals[num_types]={0};
for (i=0; i<num_types; i++)
if (per_weights[i]>0)
mult_vals[i]= (per_weights[i]*total_weight) / total_weights[i];
for (i=0; i<phi_support.size(); i++)
phi_support[i].weight *= mult_vals[phi_support[i].tag];
for (i=0; i<phi_support.size(); i++)
{
if (phi_support[i].weight<=0)
{
phi_support[i]=phi_support[phi_support.size()-1];
phi_support.pop_back();
}
}
for (i=0; i<num_types; i++)
{
total_weights[i]=0;
num_samples[i]=0;
}
for (i=0; i<phi_support.size(); i++)
{
const int type = phi_support[i].tag;
total_weights[type]+=phi_support[i].weight;
num_samples[type]++;
}
total_weight=0;
for (i=0; i<num_types; i++)
total_weight+=total_weights[i];
cout << endl << "Reweighted samples in set: " << endl;
for (i=0; i<num_types; i++)
if (num_samples[i]>0)
cout << "Type " << i << " " << num_samples[i] << "\t" << per_weights[i] << "\t" << " : " <<
total_weights[i] / total_weight << endl;
cout << endl;
}
bool are_same_upto_NDILQK(const Peptide& pep1, const Peptide& pep2)
{
if (pep1.get_num_aas() != pep2.get_num_aas())
return false;
const vector<int>& aas1 = pep1.get_amino_acids();
const vector<int>& aas2 = pep2.get_amino_acids();
int i;
for (i=0; i<aas1.size()-1; i++)
{
if (aas1[i] == aas2[i] )
continue;
if ((aas1[i] == Ile && aas2[i] == Leu) ||
(aas1[i] == Leu && aas2[i] == Ile) ||
(aas1[i] == Asn && aas2[i] == Asp) ||
(aas1[i] == Asp && aas2[i] == Asn) ||
(aas1[i] == Gln && aas2[i] == Lys) ||
(aas1[i] == Lys && aas2[i] == Gln) )
continue;
return false;
}
return (aas1[i]==aas2[i]);
}
void DeNovoRankScorer::create_training_data_for_complete_denovo_ranking(
const string& db_dir,
const string& correct_dir,
const string& mgf_list,
const double train_ratio,
const int max_num_pairs,
const int charge,
const int size_idx,
RankBoostDataset& train_ds,
RankBoostDataset& test_ds,
vector<string>* peptide_strings,
char *test_scan_file,
float ratio_denovo,
char *rerank_path,
int rerank_depth)
{
PeakRankModel *&peak_model = peak_prediction_models[model_type];
Config *config = model->get_config();
const mass_t tolerance_diff = config->get_tolerance()*0.5;
const int max_num_ppp_frags = 4;
bool print_sams=false;
if (model_type != 2)
{
cout << "Error: this training function is only intended for full de novo sequence samples!" << endl;
cout << "Need to set model type to 2, not " << model_type << endl;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -