📄 quickclustering.cpp
字号:
{
j-=31;
continue;
}
if ( ! get_matrix_val(sim_row_start,j))
continue;
if (clusters[j].get_num_basic_spectra()>max_cluster_size)
continue;
const float sim = calc_selected_dot_prod(tolerance,
clusters[j].get_peaks_pointer(),clusters[j].get_num_peaks(),
clusters[j].get_top_ranked_idxs(),
clusters[i].get_peaks_pointer(),clusters[i].get_num_peaks(),
clusters[i].get_top_ranked_idxs());
if (sim >= min_similarity_thresh)
{
mark_bit_one(sim_row_start,j);
}
else
mark_bit_zero(sim_row_start,j);
if (sim > round_similarity)
{
if (clusters[j].add_cluster(clusters[i],round_similarity))
{
clusters[i].set_tmp_cluster_idx(-1);
break;
}
}
}
}
}
// cout << "Filtered " << num_filtered << "/" << num_spectra << endl;
if (verbose)
{
for (i=0; i<clusters.size(); i++)
if (clusters[i].get_tmp_cluster_idx() >=0 &&
clusters[i].get_basic_spectra().size() == 1 &&
clusters[i].get_basic_spectra()[0].ssf->peptide.get_num_aas()>0)
cout << clusters[i].get_basic_spectra()[0].ssf->peptide.as_string(config) << endl;
}
return num_spectra;
}
/**********************************************************************
Adds spectra from the additional set to the existing clusters.
If they are added they are invalidated from further clusetering.
***********************************************************************/
int add_additional_spectra_to_existing_clusters(
Config *config,
const FileManager& fm,
FileSet& additional_fs,
mass_t tolerance,
QCPeak *basic_peaks,
vector<ClusterSpectrum>& clusters,
float min_similarity,
int max_cluster_size,
void *pmcsqs_ptr,
float filter_prob,
map<mzXML_annotation,int> *ann_map_ptr,
bool verbose)
{
float spectrum_join_similarity = 0.8;
if (min_similarity>spectrum_join_similarity)
spectrum_join_similarity = min_similarity;
// read spectra
BasicSpecReader bsr;
const int num_spectra = additional_fs.get_total_spectra();
vector<SingleSpectrumFile *>& all_ssf = additional_fs.get_non_const_ssf_pointers();
static vector<BasicSpectrum> basic_spectra;
basic_spectra.clear();
basic_spectra.reserve(num_spectra);
PMCSQS_Scorer * pmcsqs = (PMCSQS_Scorer *)pmcsqs_ptr;
int i,total_peaks_read=0;
mass_t min_m_over_z = 1E7;
mass_t max_m_over_z = 0;
for (i=0; i<num_spectra; i++)
{
if (! all_ssf[i] || all_ssf[i]->assigned_cluster>=0)
continue;
const int num_spec_peaks = bsr.read_basic_spec(config,fm,all_ssf[i],
basic_peaks + total_peaks_read);
if (num_spec_peaks<5)
{
all_ssf[i]=NULL;
continue;
}
BasicSpectrum bs;
bs.num_peaks = num_spec_peaks;
bs.peaks = basic_peaks + total_peaks_read;
bs.ssf = all_ssf[i];
if (pmcsqs && bs.ssf->sqs<0)
{
int max_charge=0;
const float prob = pmcsqs->get_sqs_for_spectrum(config,bs,&max_charge);
if (prob<filter_prob)
{
if (ann_map_ptr)
{
DAT_single *dat_ssf = (DAT_single *)bs.ssf;
map<mzXML_annotation,int>::const_iterator it;
mzXML_annotation ann_pos;
ann_pos.mzXML_file_idx = dat_ssf->mzxml_file_idx;
ann_pos.scan = dat_ssf->scan_number;
it = ann_map_ptr->find(ann_pos);
if (it != ann_map_ptr->end())
{
cout << ">> " << ++wrongly_filtered_spectra << "\t" <<
dat_ssf->mzxml_file_idx << " " << dat_ssf->scan_number <<
" --> " << prob << endl;
}
}
all_ssf[i]=NULL; // this specturm was filtered!
continue;
}
// update m/z and charge state (yes it is supposed to be const...)
//ssf->charge=max_charge;
//ssf->m_over_z = res[max_charge].mz1;
bs.ssf->sqs = prob;
}
basic_spectra.push_back(bs);
total_peaks_read += num_spec_peaks;
mass_t& m_over_z = bs.ssf->m_over_z;
if (m_over_z<min_m_over_z)
min_m_over_z = m_over_z;
if (m_over_z>max_m_over_z)
max_m_over_z = m_over_z;
}
// cout << "add: " << setw(8) << min_m_over_z << " - " << setw(8) << max_m_over_z << " : " <<
// setw(6) << basic_spectra.size() << " " << total_peaks_read << endl;
static vector<int> idx_permutations;
idx_permutations.resize(basic_spectra.size());
for (i=0; i<basic_spectra.size(); i++)
idx_permutations[i]=i;
permute_vector(idx_permutations);
// cluster the spectra
static float top_x_masses[NUM_TOP_CLUSTER_PEAKS];
static vector<int> spec_top_idxs;
int num_added=0;
for (i=0; i<basic_spectra.size(); i++)
{
const int spec_idx = idx_permutations[i];
BasicSpectrum& spec = basic_spectra[spec_idx];
const float spec_retention_time = spec.ssf->retention_time;
set_adjusted_inten(spec.peaks,spec.num_peaks);
select_top_peak_idxs(spec.peaks,spec.num_peaks,spec.ssf->m_over_z,
tolerance,spec_top_idxs, top_x_masses,
ClusterSpectrum::get_num_top_peaks_per_1000_da(),
config);
// compare to previous clusters
int j;
for (j=0; j<clusters.size(); j++)
{
if (clusters[j].get_tmp_cluster_idx() < 0)
continue;
if (! clusters[j].find_match_in_top_masses(top_x_masses))
continue;
if (clusters[j].get_num_basic_spectra()>=max_cluster_size)
continue;
const float sim = calc_selected_dot_prod(tolerance,
spec.peaks,spec.num_peaks, spec_top_idxs,
clusters[j].get_peaks_pointer(),clusters[j].get_num_peaks(),
clusters[j].get_top_ranked_idxs());
if (sim > spectrum_join_similarity)
{
clusters[j].add_spectrum_to_cluster(spec, spec_top_idxs, top_x_masses);
num_added++;
break;
}
}
}
return num_added;
}
void check_fs_for_missing_anns_and_test_sqs(Config *config,
const vector<SingleSpectrumFile *>& ssfs,
const FileManager& fm,
void *pmcsqs_ptr,
char *anns_file)
{
const mass_t tolerance = config->get_tolerance();
map<mzXML_annotation,int> ann_map, ssf_map;
read_mzXML_annotations_to_map(anns_file,ann_map);
int i;
for (i=0; i<ssfs.size(); i++)
{
DAT_single *dat_ssf = (DAT_single *)ssfs[i];
mzXML_annotation ann;
ann.charge = 0;
ann.scan = dat_ssf->scan_number;
ann.mzXML_file_idx = dat_ssf->mzxml_file_idx;
ssf_map.insert(make_pair(ann,i));
}
// check if anns are in map
int num_read=0;
int num_not_read=0;
vector<SingleSpectrumFile *> miss_ssfs;
map<mzXML_annotation,int>::const_iterator it;
for (it = ann_map.begin(); it != ann_map.end(); it++)
{
map<mzXML_annotation,int>::const_iterator ssf_it;
mzXML_annotation ann_pos;
ann_pos.mzXML_file_idx = it->first.mzXML_file_idx;
ann_pos.scan = it->first.scan;
ssf_it = ssf_map.find(ann_pos);
if (ssf_it != ssf_map.end())
{
num_read++;
miss_ssfs.push_back(ssfs[ssf_it->second]);
ssfs[ssf_it->second]->peptide.parse_from_string(config,it->first.pep);
}
else
num_not_read++;
}
cout << "NUM anns read in SSFs: " << num_read << endl;
cout << "NUM in the twilight zone: " << num_not_read << endl;
PMCSQS_Scorer * pmcsqs = (PMCSQS_Scorer *)pmcsqs_ptr;
BasicSpecReader bsr;
vector<QCPeak> peaks;
peaks.resize(6000);
QCOutputter qco;
qco.init("missed_xxx","out",0);
int num_with_little_peaks=0;
for (i=0; i<miss_ssfs.size(); i++)
{
const int num_spec_peaks = bsr.read_basic_spec(config,fm,miss_ssfs[i],
&peaks[0]);
if (num_spec_peaks<3)
{
num_with_little_peaks++;
continue;
}
BasicSpectrum bs;
bs.num_peaks = num_spec_peaks;
bs.peaks = &peaks[0];
bs.ssf = miss_ssfs[i];
if (pmcsqs && bs.ssf->sqs<0)
{
static QCPeak *tmp_peaks = NULL;
static int num_tmp_peaks = 0;
if (! tmp_peaks || bs.num_peaks>num_tmp_peaks)
{
if (tmp_peaks)
delete [] tmp_peaks;
num_tmp_peaks = (int)(bs.num_peaks * 1.5 + 1);
tmp_peaks = new QCPeak[num_tmp_peaks];
}
// need to create a special peak list that passes filtering
QCPeak *org_peaks = bs.peaks;
int num_org_peaks = bs.num_peaks;
BasicSpectrum sqs_bs = bs;
sqs_bs.peaks = tmp_peaks;
sqs_bs.ssf = bs.ssf;
const mass_t join_tolerance = (tolerance < 0.05 ? tolerance : 0.6 * tolerance);
int p_idx=0;
int j=1;
while (j<num_org_peaks)
{
if (org_peaks[j].mass - org_peaks[p_idx].mass<=join_tolerance)
{
intensity_t inten_sum = org_peaks[i].intensity + org_peaks[p_idx].intensity;
mass_t new_mass = (org_peaks[j].intensity * org_peaks[j].mass +
org_peaks[p_idx].intensity * org_peaks[p_idx].mass ) / inten_sum;
org_peaks[p_idx].mass = new_mass;
org_peaks[p_idx].intensity = inten_sum;
}
else
{
org_peaks[++p_idx]=org_peaks[j];
}
j++;
}
num_org_peaks = p_idx+1;
vector<bool> indicators;
mark_top_peaks_with_sliding_window(bs.peaks,
bs.num_peaks,
config->get_local_window_size(),
config->get_max_number_peaks_per_local_window(),
indicators);
if (bs.ssf->precursor_intensity <=0)
{
float inten=0;
int j;
for (j=0; j<num_org_peaks; i++)
inten+=org_peaks[j].intensity;
bs.ssf->precursor_intensity=inten;
}
p_idx=0;
for (j=0; j<num_org_peaks; j++)
if (indicators[j])
tmp_peaks[p_idx++]=org_peaks[j];
sqs_bs.num_peaks = p_idx;
// cout << num_org_peaks << "->" << sqs_bs.num_peaks << endl;
int max_charge=0;
const float prob = pmcsqs->get_sqs_for_spectrum(config,sqs_bs,&max_charge);
cout << i << "\t" << setprecision(3) << prob << "\t";
bs.ssf->type = DAT;
bs.ssf->print_ssf_stats(config);
}
ClusterSpectrum cs;
cs.create_new_cluster(config,bs,i);
qco.output_cluster_spectrum(cs);
}
cout << "NUM WITH LITTLE PEAKS: " << num_with_little_peaks << endl;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -