📄 quickclustering.cpp
字号:
total_mismatched += n_mis;
if (n_mis>0)
clusters[i].print_cluster_peptides();
}
spec_idx = end_stage_one_idx + 1; // go back to the end of stage one
// update the file which holds the last mass clustered
ofstream last_mass_stream(last_good_mass_name.c_str(),ios::out);
last_mass_stream << fixed << setprecision(3) << max_m_over_z << endl;
last_mass_stream.close();
}
if (sim_matrix)
delete [] sim_matrix;
cout << endl << endl << "Total spectra read and clustered: " << total_spectra_read << " (" <<
total_spectra << ")" << endl;
cout << "# spectra in clusters: " << total_spectra_in_clusters << endl;
cout << "% spectra in clusters: " << setprecision(3) <<
(double)total_spectra_in_clusters / (double)total_spectra << endl;
cout << "# clusters: " << num_clusters << " , " << "Avg cluster size: " <<
avg_cluster_size/(double)num_clusters << endl;
if (total_mismatched>0)
cout << "Total mismatched spectra: " << total_mismatched << " (" <<
(double)total_mismatched/total_spectra_read << ")" << endl;
// cluster histogram
cout << "Histogram of clusters: " << endl;
cout << "max size count" << endl;
int i;
for (i=0; i<num_clust_vals; i++)
{
cout << setw(8) << left << clust_vals[i] << clust_counts[i] << endl;
}
cout << ">" << setw(7) << left << clust_vals[num_clust_vals-1] <<
clust_counts[num_clust_vals] << endl;
double ratio = (double)total_spectra_in_clusters / (double)total_spectra;
if ((filter_prob_for_min_size_clusters<0.2 && ratio<0.45) ||
(filter_prob_for_min_size_clusters>0.2 && ratio < 0.4))
{
cout << endl << "WARNING: only " << fixed << setprecision(2) << ratio*100 << "%" <<
" of the spectra found there way into clusters. This might lead to loss of idnetifications." << endl;
cout << "If you are filtering, you might consider reducing the quality threshold to a lower value,";
cout << "e.g., by using the flag \"-min_filter_prob " << filter_prob*0.5 << endl;
}
}
/**********************************************************************
Creates cluster spec for the set of basic spectra.
First reads the spectra and copies the peaks into the bulk peak allocation
returns number of spectra actually read (does not read spectra that
were already assigned to a cluster).
The clustering is done in two phases. First a tight distance threshold
is implemented, and in the second phase it is relaxed (this way the clusters
should be more homegneous).
***********************************************************************/
int cluster_spec_in_file_set(Config *config,
const FileManager& fm,
FileSet& cluster_fs,
mass_t tolerance,
QCPeak *basic_peaks,
vector<ClusterSpectrum>& clusters,
float min_similarity,
int max_small_cluster_size,
int max_cluster_size,
int num_top_peaks_per_1000_da,
bool verbose,
void *pmcsqs_ptr,
float filter_prob,
map<mzXML_annotation,int> *ann_map_ptr)
{
// set clustering similarity thresholds
vector<float> similarity_vals;
int num_rounds;
if (min_similarity >= 0.9)
{
similarity_vals.push_back(min_similarity);
num_rounds=1;
}
else
{
similarity_vals.push_back(0.9);
if (min_similarity>=0.8)
{
similarity_vals.push_back(min_similarity);
num_rounds=2;
}
else
{
similarity_vals.push_back((min_similarity+0.9)/2.0);
similarity_vals.push_back(min_similarity);
num_rounds=3;
}
}
const float min_similarity_thresh = (min_similarity <0.2 ? min_similarity : 0.2); // don't test similarity if a previously recored // similarity between clusters is less than this value
PMCSQS_Scorer * pmcsqs = (PMCSQS_Scorer *)pmcsqs_ptr;
BasicSpecReader bsr;
const int num_spectra = cluster_fs.get_total_spectra();
vector<SingleSpectrumFile *>& all_ssf = cluster_fs.get_non_const_ssf_pointers();
static vector<BasicSpectrum> basic_spectra;
int i;
// set max_small_cluster_size
if (max_small_cluster_size<0)
max_small_cluster_size = 10 + (int)(0.8*log(all_ssf.size()));
if (max_small_cluster_size > max_cluster_size)
max_small_cluster_size = max_cluster_size;
// read all the basic spectra into a central spectra repository
int total_peaks_read=0;
mass_t min_m_over_z = 1E7;
mass_t max_m_over_z = 0;
int num_filtered=0;
basic_spectra.clear();
basic_spectra.reserve(num_spectra);
for (i=0; i<num_spectra; i++)
{
if (! all_ssf[i]) // invalidated
continue;
const int num_spec_peaks = bsr.read_basic_spec(config,fm,all_ssf[i], basic_peaks + total_peaks_read);
if (num_spec_peaks<5)
{
all_ssf[i]=NULL;
continue;
}
BasicSpectrum bs;
bs.num_peaks = num_spec_peaks;
bs.peaks = basic_peaks + total_peaks_read;
bs.ssf = all_ssf[i];
if (pmcsqs && bs.ssf->sqs<0)
{
int max_charge=0;
const float prob = pmcsqs->get_sqs_for_spectrum(config,bs,&max_charge);
if (prob<filter_prob)
{
if (ann_map_ptr)
{
DAT_single *dat_ssf = (DAT_single *)bs.ssf;
map<mzXML_annotation,int>::const_iterator it;
mzXML_annotation ann_pos;
ann_pos.mzXML_file_idx = dat_ssf->mzxml_file_idx;
ann_pos.scan = dat_ssf->scan_number;
it = ann_map_ptr->find(ann_pos);
if (it != ann_map_ptr->end())
{
cout << ">> " << ++wrongly_filtered_spectra << "\t" <<
dat_ssf->mzxml_file_idx << " " << dat_ssf->scan_number <<
" --> " << prob << endl;
}
}
all_ssf[i]=NULL; // this specturm was filtered!
num_filtered++;
continue;
}
// update m/z and charge state (yes it is supposed to be const...)
bs.ssf->charge = max_charge;
bs.ssf->sqs = prob;
}
basic_spectra.push_back(bs);
total_peaks_read += num_spec_peaks;
mass_t& m_over_z = bs.ssf->m_over_z;
if (m_over_z<min_m_over_z)
min_m_over_z = m_over_z;
if (m_over_z>max_m_over_z)
max_m_over_z = m_over_z;
}
// First stage, compare the basic spectra with clusters
// Use high similarity threshold
// If no cluster is found, create a new clusters for the spectrum
// the calculated simlarities are stored is the sim matrix and can be used
// in later stages to detect the need to re test the similarity
const float first_stage_sim = similarity_vals[0];
unsigned char * start_pos = sim_matrix;
static vector<int> idx_permutations;
idx_permutations.resize(basic_spectra.size());
for (i=0; i<basic_spectra.size(); i++)
idx_permutations[i]=i;
permute_vector(idx_permutations);
static vector<int> spec_top_idxs;
static float top_x_masses[NUM_TOP_CLUSTER_PEAKS];
for (i=0; i<basic_spectra.size(); i++)
{
const int spec_idx = idx_permutations[i];
BasicSpectrum& spec = basic_spectra[spec_idx];
const int spec_charge = spec.ssf->charge;
set_adjusted_inten(spec.peaks,spec.num_peaks);
select_top_peak_idxs(spec.peaks,spec.num_peaks,spec.ssf->m_over_z,
tolerance, spec_top_idxs, top_x_masses,
ClusterSpectrum::get_num_top_peaks_per_1000_da(), config);
// compare to previous clusters
int j;
for (j=0; j<clusters.size(); j++)
{
ClusterSpectrum& curr_clust = clusters[j];
if (! curr_clust.find_match_in_top_masses(top_x_masses))
{
mark_bit_zero(start_pos,j);
continue;
}
if (curr_clust.get_num_basic_spectra()>= max_cluster_size)
continue;
if (curr_clust.get_charge()>0 && spec_charge>0 && curr_clust.get_charge() !=spec_charge)
{
mark_bit_zero(start_pos,j);
continue;
}
const float sim = calc_selected_dot_prod(tolerance,
spec.peaks,spec.num_peaks, spec_top_idxs,
curr_clust.get_peaks_pointer(),curr_clust.get_num_peaks(),
curr_clust.get_top_ranked_idxs(),verbose);
if (sim >= min_similarity_thresh)
{
mark_bit_one(start_pos,j);
}
else
mark_bit_zero(start_pos,j);
// add this spectrum to an existing cluster
if (sim > first_stage_sim)
{
curr_clust.add_spectrum_to_cluster(spec, spec_top_idxs,top_x_masses);
break;
}
}
if (j<clusters.size()) // we added the spectrum to an existing cluster
continue;
// create new cluster from spectrum
clusters.resize(clusters.size()+1);
ClusterSpectrum& cs = clusters[clusters.size()-1];
cs.create_new_cluster(config, spec, clusters.size()-1);
cs.set_charge(spec.ssf->charge);
cs.set_top_ranked_idxs(spec_top_idxs);
cs.set_top_masses(top_x_masses);
cs.set_sim_matrix_row_start(start_pos);
// round off the start position to the next byte
unsigned char *old = start_pos;
start_pos += ((j+7) >> 3);
}
// second stage try joining clusters
// first start with the joining larger clusters
// use lower threshold
int round;
for (round=1; round<num_rounds; round++)
{
const float round_similarity = similarity_vals[round];
const float tighter_similarity = 1.0 - (1.0 - similarity_vals[round])/2.0;
// join larger clusters, use tighter similaarity
for (i=clusters.size()-1; i>0; i--)
{
if (clusters[i].get_tmp_cluster_idx()<0)
continue;
if (clusters[i].get_num_basic_spectra()>=max_cluster_size)
continue;
unsigned char *sim_row_start = clusters[i].get_sim_matrix_row_start();
const int num_spec_i = clusters[i].get_num_basic_spectra();
int j;
for (j=i-1; j>=0; j--)
{
const int size_sum = clusters[j].get_num_basic_spectra() + num_spec_i;
if (clusters[j].get_tmp_cluster_idx()<0 ||
size_sum <= max_small_cluster_size ||
size_sum >= max_cluster_size)
continue;
// skip 32 places if the matirx is all zeros in that area
if ((j % 32 == 31) && ! get_matrix_32_bits(sim_row_start,j))
{
j-=31;
continue;
}
if (! get_matrix_val(sim_row_start,j))
continue;
const float sim = calc_selected_dot_prod(tolerance,
clusters[j].get_peaks_pointer(),clusters[j].get_num_peaks(),
clusters[j].get_top_ranked_idxs(),
clusters[i].get_peaks_pointer(),clusters[i].get_num_peaks(),
clusters[i].get_top_ranked_idxs());
if (sim >= min_similarity_thresh)
{
mark_bit_one(sim_row_start,j);
}
else
mark_bit_zero(sim_row_start,j);
if (sim > tighter_similarity)
{
if (clusters[j].add_cluster(clusters[i], tighter_similarity))
{
clusters[i].set_tmp_cluster_idx(-1);
break;
}
}
}
}
// join smaller clusters, use the round similarity
for (i=clusters.size()-1; i>0; i--)
{
if (clusters[i].get_tmp_cluster_idx()<0 ||
clusters[i].get_num_basic_spectra() >= max_small_cluster_size)
continue;
unsigned char * sim_row_start = clusters[i].get_sim_matrix_row_start();
const int num_spec_i = clusters[i].get_num_basic_spectra();
int j;
for (j=i-1; j>=0; j--)
{
if (clusters[j].get_tmp_cluster_idx()<0 ||
num_spec_i + clusters[j].get_num_basic_spectra() >= max_small_cluster_size)
continue;
// skip 32 places if the matirx is all zeros in that area
if ((j % 32 == 31) && ! get_matrix_32_bits(sim_row_start,j))
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -