📄 denovoranktrain.cpp
字号:
#include "DeNovoRankScore.h"
#include "DeNovoSolutions.h"
#include "auxfun.h"
bool add_peptide_set_to_psm(PeptideSetMap& psm, PeptideSet& pep_set)
{
PeptideSetMap::iterator it;
scan_pair key(pep_set.file_idx,pep_set.scan);
it=psm.find(key);
if (it == psm.end())
{
psm.insert(pair<scan_pair,PeptideSet>(key,pep_set));
return false;
}
PeptideSet& set_in_map = (*it).second;
const int corr_aa_before = set_in_map.correct_sol.pep.get_aa_before();
const int corr_aa_after = set_in_map.correct_sol.pep.get_aa_after();
int i;
for (i=0; i<pep_set.incorrect_sols.size(); i++)
{
PeptideSolution& sol = pep_set.incorrect_sols[i];
// check if sol is already there
int j;
for (j=0; j<set_in_map.incorrect_sols.size(); j++)
if (sol.pep == set_in_map.incorrect_sols[j].pep)
break;
// if (j<set_in_map.incorrect_sols.size())
// continue;
if (sol.pep.get_aa_before()<0 && sol.pep.get_aa_after()<0)
{
sol.pep.set_aa_before(corr_aa_before);
sol.pep.set_aa_after(corr_aa_after);
}
set_in_map.incorrect_sols.push_back(sol);
}
return true;
}
void read_correct_seqs(const string& path,
vector<string>& seqs,
vector<char>& n_aas,
vector<char>& c_aas,
vector<float>& mqscores)
{
ifstream ifs(path.c_str());
if (! ifs.is_open())
{
cout << "Error: couldn't find file with correct sequences: " << path << endl;
exit(1);
}
seqs.clear();
n_aas.clear();
c_aas.clear();
while (! ifs.eof())
{
char buff[128];
string pep_seq;
char n_aa='$',c_aa='$';
float mqscore=NEG_INF;
ifs.getline(buff,128);
if (ifs.gcount()<5)
break;
istringstream iss(buff);
iss >> pep_seq >> n_aa >> c_aa >> mqscore;
if (n_aa == '$' || c_aa == '$' || pep_seq.length()<4 || pep_seq.length()>100)
{
cout << "Error: bad line:|" << buff << "|" << endl;
exit(1);
}
seqs.push_back(pep_seq);
n_aas.push_back(n_aa);
c_aas.push_back(c_aa);
mqscores.push_back(mqscore);
}
ifs.close();
}
/************************************************************************
*************************************************************************/
void add_db_hits_to_psm(Config *config,
const vector<string>& names,
const string& db_dir,
const string& correct_seq_dir,
const string& file_suffix,
int file_charge,
PeptideSetMap& psm)
{
const vector<int>& char2aa = config->get_char2aa();
int num_read=0;
int total_num_sets=0;
int total_num_peptides=0;
int file_idx;
for (file_idx=0; file_idx<names.size(); file_idx++)
{
string db_file = db_dir + "/" + names[file_idx] + file_suffix;
ifstream ifs(db_file.c_str());
if (! ifs.is_open())
continue;
cout << file_idx << "\t" << db_file << endl;
num_read++;
int num_sets=0;
int num_peptides=0;
int num_with_correct=0;
int num_first_correct=0;
vector<string> correct_seqs;
vector<char> correct_n_aas, correct_c_aas;
vector<float> mqscores;
string seq_path = correct_seq_dir + "/" + names[file_idx] + ".txt";
read_correct_seqs(seq_path,correct_seqs, correct_n_aas, correct_c_aas, mqscores);
while (! ifs.eof())
{
char buff[2048];
ifs.getline(buff,2048);
if (ifs.gcount()>2046)
cout << "Warning: buffer size?" << endl;
if (ifs.gcount()<5)
continue;
istringstream iss(buff);
int scan=-1;
mass_t mass_with_19 = -1;
int num_seqs = -1;
iss >> scan >> mass_with_19 >> num_seqs;
if (scan<0 ||
mass_with_19<0 ||
mass_with_19>10000 ||
num_seqs<0 ||
num_seqs>100)
{
cout << scan << "\t" <<mass_with_19 << "\t" << num_seqs << endl;
cout << "Error parsing line in file " << db_file << endl << "LINE:" <<buff << endl;
exit(1);
}
if (scan>correct_seqs.size())
{
cout << "Error: mismatch in correct seqs file!" << endl;
exit(1);
}
const string& correct_pep_str = correct_seqs[scan];
bool has_correct = false;
bool correct_first = false;
PeptideSet set;
set.file_idx=file_idx;
set.scan = scan;
set.correct_sol.charge = file_charge;
set.correct_sol.type = SOL_CORRECT;
set.correct_sol.pep.parse_from_string(config,correct_pep_str);
set.correct_sol.pep.set_aa_before(char2aa[correct_n_aas[scan]]);
set.correct_sol.pep.set_aa_after(char2aa[correct_c_aas[scan]]);
set.correct_sol.num_correct_aas = set.correct_sol.pep.get_num_aas();
set.correct_sol.pm_with_19 = set.correct_sol.pep.get_mass_with_19();
set.correct_sol.reaches_n_terminal = true;
set.correct_sol.reaches_c_terminal = true;
set.correct_sol.MQScore= mqscores[scan];
vector<string> inserted_strings;
inserted_strings.clear();
int i;
for (i=0; i<num_seqs; i++)
{
char char_before, char_after;
int charge=0;
float mqscore;
string peptide_str;
iss >> char_before >> char_after >> charge >> mqscore >> peptide_str;
const int aa_before = char2aa[char_before];
const int aa_after = char2aa[char_after];
if (charge<=0 || charge>5)
{
cout << "Error: bad charge: " << charge << " in line:" << endl << buff << endl;
exit(1);
}
int j;
for (j=0; j<peptide_str.length(); j++)
if (peptide_str[j]=='I')
peptide_str[j]='L' ;
// check if peptide is same as correct don't add it
if (peptide_str == correct_pep_str)
{
has_correct = true;
if (i==0)
correct_first = true;
continue;
}
for (j=0; j<inserted_strings.size(); j++)
if (inserted_strings[j] == peptide_str)
break;
if (j<inserted_strings.size())
continue;
inserted_strings.push_back(peptide_str);
PeptideSolution sol;
sol.charge = charge;
sol.pep.parse_from_string(config,peptide_str);
sol.pep.set_aa_before(aa_before);
sol.pep.set_aa_after(aa_after);
sol.MQScore = mqscore;
sol.num_correct_aas = set.correct_sol.pep.calc_number_of_correct_aas(config,sol.pep);
sol.type= SOL_INCORRECT_DB;
sol.pm_with_19 = sol.pep.get_mass_with_19();
sol.reaches_c_terminal=true;
sol.reaches_n_terminal=true;
set.incorrect_sols.push_back(sol);
num_peptides++;
}
if (add_peptide_set_to_psm(psm,set))
total_num_sets++;
num_sets++;
if (has_correct)
num_with_correct++;
if (correct_first)
num_first_correct++;
}
ifs.close();
cout << "\tsets added: " << num_sets << endl;
cout << "\thad correct: " << float(num_with_correct)/num_sets << endl;
cout << "\tcorrect first:" << float(num_first_correct)/num_sets << endl;
cout << endl;
}
cout << "Processed " << num_read << "/" << names.size() << " dbh files." << endl;
}
void add_denovo_dbh_to_psm(Config *config,
const vector<string>& names,
const string& denovo_dir,
const string& correct_seq_dir,
const string& file_suffix,
int file_charge,
PeptideSetMap& psm)
{
const vector<int>& char2aa = config->get_char2aa();
int num_read=0;
int total_num_peptides=0;
int file_idx;
for (file_idx=0; file_idx<names.size(); file_idx++)
{
string dnv_file = denovo_dir + "/" + names[file_idx] + file_suffix;
ifstream ifs(dnv_file.c_str());
if (! ifs.is_open())
continue;
cout << file_idx << "\t" << dnv_file << endl;
num_read++;
int num_new_sets=0;
int num_sets=0;
int num_with_correct=0;
int num_first_correct=0;
vector<string> correct_seqs;
vector<char> correct_n_aas, correct_c_aas;
vector<float> mqscores;
string seq_path = correct_seq_dir + "/" + names[file_idx] + ".txt";
read_correct_seqs(seq_path,correct_seqs, correct_n_aas, correct_c_aas, mqscores);
while (! ifs.eof())
{
char buff[2048];
ifs.getline(buff,2048);
if (ifs.gcount()>2046)
cout << "Warning: buffer size?" << endl;
if (ifs.gcount()<5)
continue;
istringstream iss(buff);
int scan=-1;
mass_t mass_with_19 = -1;
int num_seqs = -1;
iss >> scan >> mass_with_19 >> num_seqs;
if (scan<0 ||
mass_with_19<0 ||
mass_with_19>10000 ||
num_seqs<0 ||
num_seqs>100)
{
cout << scan << "\t" <<mass_with_19 << "\t" << num_seqs << endl;
cout << "Error parsing line in file " << dnv_file << endl << "LINE:" <<buff << endl;
exit(1);
}
if (scan>correct_seqs.size())
{
cout << "Error: mismatch in correct seqs file!" << endl;
exit(1);
}
const string& correct_pep_str = correct_seqs[scan];
bool has_correct = false;
bool correct_first = false;
PeptideSet set;
set.file_idx=file_idx;
set.scan = scan;
set.correct_sol.charge = file_charge;
set.correct_sol.type = SOL_CORRECT;
set.correct_sol.pep.parse_from_string(config,correct_pep_str);
set.correct_sol.pep.set_aa_before(char2aa[correct_n_aas[scan]]);
set.correct_sol.pep.set_aa_after(char2aa[correct_c_aas[scan]]);
set.correct_sol.num_correct_aas = set.correct_sol.pep.get_num_aas();
set.correct_sol.pm_with_19 = set.correct_sol.pep.get_mass_with_19();
set.correct_sol.reaches_n_terminal = true;
set.correct_sol.reaches_c_terminal = true;
set.correct_sol.MQScore= mqscores[scan];
vector<string> inserted_strings;
inserted_strings.clear();
int i;
for (i=0; i<num_seqs; i++)
{
char char_before, char_after;
int charge=0;
float mqscore;
string peptide_str;
iss >> char_before >> char_after >> charge >> mqscore >> peptide_str;
const int aa_before = char2aa[char_before];
const int aa_after = char2aa[char_after];
if (charge<=0 || charge>5)
{
cout << "Error: bad charge: " << charge << " in line:" << endl << buff << endl;
exit(1);
}
int j;
for (j=0; j<peptide_str.length(); j++)
if (peptide_str[j]=='I')
peptide_str[j]='L' ;
// check if peptide is same as correct don't add it
if (peptide_str == correct_pep_str)
{
has_correct = true;
if (i==0)
correct_first = true;
continue;
}
for (j=0; j<inserted_strings.size(); j++)
if (inserted_strings[j] == peptide_str)
break;
if (j<inserted_strings.size())
continue;
inserted_strings.push_back(peptide_str);
PeptideSolution sol;
sol.charge = charge;
sol.pep.parse_from_string(config,peptide_str);
sol.pep.set_aa_before(aa_before);
sol.pep.set_aa_after(aa_after);
sol.MQScore = mqscore;
sol.num_correct_aas = set.correct_sol.pep.calc_number_of_correct_aas(config,sol.pep);
sol.type= SOL_INCORRECT_DENOVO;
sol.pm_with_19 = sol.pep.get_mass_with_19();
sol.reaches_c_terminal=true;
sol.reaches_n_terminal=true;
set.incorrect_sols.push_back(sol);
}
if (! add_peptide_set_to_psm(psm,set))
num_new_sets++;
num_sets++;
if (has_correct)
num_with_correct++;
if (correct_first)
num_first_correct++;
}
ifs.close();
cout << "\tnew sets added : " << num_new_sets << endl;
cout << "\tsets added : " << num_sets << endl;
cout << "\thad correct : " << float(num_with_correct)/num_sets << endl;
cout << "\tcorrect first : " << float(num_first_correct)/num_sets << endl;
cout << endl;
}
cout << "Processed " << num_read << "/" << names.size() << " dbh files." << endl;
}
void add_denovo_paths_to_psm(Config *config,
const vector<string>& names,
const string& denovo_dir,
const string& file_suffix,
int file_charge,
PeptideSetMap& psm)
{
const vector<int>& char2aa = config->get_char2aa();
int num_read=0;
int total_num_sets=0;
int total_num_peptides=0;
int file_idx;
for (file_idx=0; file_idx<names.size(); file_idx++)
{
string denovo_file = denovo_dir + "/" + names[file_idx] + file_suffix;
ifstream ifs(denovo_file.c_str());
if (! ifs.is_open())
continue;
cout << file_idx << "\t" << denovo_file << endl;
num_read++;
int num_sets=0;
int num_with_correct=0;
int num_first_correct=0;
while (! ifs.eof())
{
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -