📄 filemanagement.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:
	dat_files.resize(num_dat);
	ms2_files.resize(num_ms2);

	int mgf_c=0;
	int dta_c=0;
	int mzxml_c=0;
	int dat_c=0;
	int ms2_c=0;

	for (i=0; i<list.size(); i++)
    {
		if (list[i][0] == '#')
			continue;

		int last_pos = list[i].length()-1;
		if (list[i][last_pos] == '\n' || list[i][last_pos] == '\r' || 
			list[i][last_pos] == '\t' || list[i][last_pos] == '\f')
			last_pos--;

		if (list[i][last_pos-2]=='d' && list[i][last_pos-1]=='t' && 
			list[i][last_pos  ]=='a')
		{
			dta_files[dta_c].single_name = list[i];

			if (quick_flag)
			{
				dta_files[dta_c].scan_dta(list[i],config);
			}
			else
				dta_files[dta_c].initial_read(config);

			dta_files[dta_c].type = DTA;
			dta_files[dta_c].file_idx =  dta_c;

			if (dta_files[dta_c].org_pm_with_19 < min_spec_mass)
				min_spec_mass = dta_files[dta_c].org_pm_with_19;

			if (dta_files[dta_c].org_pm_with_19 > max_spec_mass)
				max_spec_mass = dta_files[dta_c].org_pm_with_19;

			if (dta_files[dta_c].charge < min_charge)
				min_charge = dta_files[dta_c].charge;

			if (dta_files[dta_c].charge > max_charge)
				max_charge = dta_files[dta_c].charge;

			num_spectra[dta_files[dta_c].charge]++;

			dta_c++;
		}	
		else if (list[i][last_pos-2]=='m' && list[i][last_pos-1]=='g' && 
			list[i][last_pos  ]=='f')
		{
			mgf_files[mgf_c].mgf_name =list[i];

//			cout << "DD1 reading idx " << mgf_c << endl;
			int mgf_file_idx = (file_idx>=0 ? mgf_c + file_idx : mgf_c);

			mgf_files[mgf_c].initial_read(config,mgf_file_idx, quick_flag);

			if (mgf_files[mgf_c].min_spec_mass<min_spec_mass)
				min_spec_mass = mgf_files[mgf_c].min_spec_mass;

			if (mgf_files[mgf_c].max_spec_mass>max_spec_mass)
				max_spec_mass = mgf_files[mgf_c].max_spec_mass;

			if (mgf_files[mgf_c].min_charge < min_charge)
				min_charge = mgf_files[mgf_c].min_charge;

			if (mgf_files[mgf_c].max_charge > max_charge)
				max_charge = mgf_files[mgf_c].max_charge;

			int c;
			for (c=0; c<=max_charge; c++)
				num_spectra[c] += mgf_files[mgf_c].num_spectra[c];

			mgf_c++;	
		}
		else if (list[i][last_pos-4] == 'm' &&
				 list[i][last_pos-3] == 'z' &&
				list[i][last_pos-2]=='X' && list[i][last_pos-1]=='M' && 
				list[i][last_pos  ]=='L')
		{
			mzxml_files[mzxml_c].mzxml_name =list[i];

			int mzxml_file_idx = (file_idx>=0 ? mzxml_c + file_idx : mzxml_c);

			mzxml_files[mzxml_c].initial_read(config,mzxml_c);

			if (mzxml_files[mzxml_c].min_spec_mass<min_spec_mass)
				min_spec_mass = mzxml_files[mzxml_c].min_spec_mass;

			if (mzxml_files[mzxml_c].max_spec_mass>max_spec_mass)
				max_spec_mass = mzxml_files[mzxml_c].max_spec_mass;

			if (mzxml_files[mzxml_c].min_charge < min_charge)
				min_charge = mzxml_files[mzxml_c].min_charge;

			if (mzxml_files[mzxml_c].max_charge > max_charge)
				max_charge = mzxml_files[mzxml_c].max_charge;

	//		int c;
	//		for (c=0; c<=max_charge; c++)
	//			num_spectra[c] += mzxml_files[mzxml_c].num_spectra[c];

			mzxml_c++;	
		}
		else if (list[i][last_pos-2]=='d' && list[i][last_pos-1]=='a' && 
				 list[i][last_pos  ]=='t')
		{
			dat_files[dat_c].dat_name =list[i];

			int dat_file_idx = (file_idx>=0 ? dat_c + file_idx : dat_c);

			dat_files[dat_c].initial_read(config,dat_file_idx);

			if (dat_files[dat_c].min_spec_mass<min_spec_mass)
				min_spec_mass = dat_files[dat_c].min_spec_mass;

			if (dat_files[dat_c].max_spec_mass>max_spec_mass)
				max_spec_mass = dat_files[dat_c].max_spec_mass;

			if (dat_files[dat_c].min_charge < min_charge)
				min_charge = dat_files[dat_c].min_charge;

			if (dat_files[dat_c].max_charge > max_charge)
				max_charge = dat_files[dat_c].max_charge;

			int c;
			for (c=0; c<=max_charge; c++)
				num_spectra[c] += dat_files[dat_c].num_spectra[c];

			dat_c++;	
		}
		else if (list[i][last_pos-2]=='m' && list[i][last_pos-1]=='s' && 
			list[i][last_pos  ]=='2')
		{
			ms2_files[ms2_c].ms2_name =list[i];

			ms2_files[ms2_c].initial_read(config,ms2_c, quick_flag);

			if (ms2_files[ms2_c].min_spec_mass<min_spec_mass)
				min_spec_mass = ms2_files[ms2_c].min_spec_mass;

			if (ms2_files[ms2_c].max_spec_mass>max_spec_mass)
				max_spec_mass = ms2_files[ms2_c].max_spec_mass;

			if (ms2_files[ms2_c].min_charge < min_charge)
				min_charge = ms2_files[ms2_c].min_charge;

			if (mgf_files[ms2_c].max_charge > max_charge)
				max_charge = ms2_files[ms2_c].max_charge;

			int c;
			for (c=0; c<=max_charge; c++)
				num_spectra[c] += ms2_files[ms2_c].num_spectra[c];

			ms2_c++;	
		}
		else
		{
			cout << "Error :: couldn't recognize file type for: " << list[i] << endl;
			exit(1);
		}
    }

	count_num_spectra();
//	print_summary_stats();
}


/*****************************************************************
 Returns how many spectra are present in the files named by the
 list file. Also samples m_over_z values into mass_histogram to
 generate an approximate histogram in case the set of spectra
 needs to be split.

 config         - handed to the per-format readers.
 list_file      - handed to read_paths_into_list() to obtain the
                  paths; entries starting with '#' and blank
                  entries are skipped.
 mass_histogram - cleared, then filled with the sampled m_over_z
                  values.

 Prints an error and exits if an entry's extension is not one of
 dta / mgf / mzXML / dat / ms2.
******************************************************************/
int FileManager::count_num_spectra(Config *config, const char* list_file,
						  vector<mass_t>& mass_histogram) const
{
	vector<string> list;
	read_paths_into_list(list_file,list);

	const size_t max_histogram_size = 100000; // histogram is thinned once it reaches this size
	size_t set_size = 1000;    // number of spectra that are examined for mass collection before flushing
	int num_spectra_read=0;
	vector<mass_t> masses;     // m_over_z values gathered since the last flush

	mass_histogram.clear();

	for (size_t i=0; i<list.size(); i++)
	{
		const string& path = list[i];
		if (path.empty() || path[0] == '#')
			continue;

		// ignore a single trailing whitespace character when locating the extension
		int last_pos = static_cast<int>(path.length())-1;
		const char tail = path[last_pos];
		if (tail == '\n' || tail == '\r' || tail == '\t' || tail == '\f')
			last_pos--;

		if (last_pos<0)
			continue;	// entry held nothing but that whitespace character

		// extract the candidate extensions only when the entry is long enough,
		// so short entries fall through to the error branch instead of
		// indexing before the start of the string
		const string ext3 = (last_pos>=2 ? path.substr(last_pos-2,3) : string());
		const string ext5 = (last_pos>=4 ? path.substr(last_pos-4,5) : string());

		if (ext3 == "dta")
		{
			// the dta file must be scanned before its m_over_z can be used
			DTA_file dta_file;
			dta_file.scan_dta(path,config);

			masses.push_back(dta_file.m_over_z);
			num_spectra_read++;
		}
		else if (ext3 == "mgf")
		{
			MGF_file mgf_file;
			mgf_file.mgf_name = path;
			mgf_file.initial_read(config,0,true);

			for (size_t j=0; j<mgf_file.single_spectra.size(); j++)
				masses.push_back(mgf_file.single_spectra[j].m_over_z);

			num_spectra_read += static_cast<int>(mgf_file.single_spectra.size());
		}
		else if (ext5 == "mzXML")
		{
			MZXML_file mzxml_file;
			mzxml_file.mzxml_name = path;
			mzxml_file.initial_read(config,0);

			for (size_t j=0; j<mzxml_file.single_spectra.size(); j++)
				masses.push_back(mzxml_file.single_spectra[j].m_over_z);

			num_spectra_read += static_cast<int>(mzxml_file.single_spectra.size());
		}
		else if (ext3 == "dat")
		{
			DAT_file dat_file;
			dat_file.dat_name = path;
			dat_file.initial_read(config,0);

			for (size_t j=0; j<dat_file.single_spectra.size(); j++)
				masses.push_back(dat_file.single_spectra[j].m_over_z);

			num_spectra_read += static_cast<int>(dat_file.single_spectra.size());
		}
		else if (ext3 == "ms2")
		{
			MS2_file ms2_file;
			ms2_file.ms2_name = path;
			ms2_file.initial_read(config,0);

			for (size_t j=0; j<ms2_file.single_spectra.size(); j++)
				masses.push_back(ms2_file.single_spectra[j].m_over_z);

			num_spectra_read += static_cast<int>(ms2_file.single_spectra.size());
		}
		else
		{
			cout << "Error: couldn't recognize file type for: " << list[i] << endl;
			exit(1);
		}


		// if enough spectra were read, sample their masses and clear buffer
		if (masses.size()>set_size)
		{
			// sample 10 masses per set_size spectra, rounded to the nearest
			// integer (the ratio is computed in floating point so that the
			// rounding actually takes effect)
			const int num_sample =
				static_cast<int>((static_cast<double>(masses.size())/set_size)*10 + 0.5);

			for (int s=0; s<num_sample; s++)
			{
				size_t idx = static_cast<size_t>(my_random() * masses.size());
				if (idx >= masses.size())	// guard in case my_random() can return exactly 1.0
					idx = masses.size()-1;

				mass_histogram.push_back(masses[idx]);
			}

			masses.clear();
		}

		// when the histogram gets too large, keep each entry with probability
		// 0.5 and halve the sampling rate from here on by doubling set_size.
		// The size can jump past the limit in one batch, hence >= and not ==.
		if (mass_histogram.size() >= max_histogram_size)
		{
			vector<mass_t> thinned;

			for (size_t j=0; j<mass_histogram.size(); j++)
				if (my_random()>=0.5)
					thinned.push_back(mass_histogram[j]);

			mass_histogram.swap(thinned);
			set_size *= 2;
		}
	}


	return num_spectra_read;
}


// counts how many total spectra are available from each charge
void FileManager::count_num_spectra()
{
	num_spectra.clear();
	num_spectra.resize(20,0);
	min_charge =99;
	max_charge =-1;


	int i;
	for (i=0; i<dta_files.size(); i++)
	{
		int charge =dta_files[i].charge;
		num_spectra[charge]++;
		if (charge<min_charge)
			min_charge = charge;
		if (charge>max_charge)
			max_charge = charge;
	}

	for (i=0; i<mgf_files.size(); i++)
	{
		int j;
		for (j=0; j<mgf_files[i].single_spectra.size(); j++)
		{
			int charge =mgf_files[i].single_spectra[j].charge;
			num_spectra[charge]++;
			if (charge<min_charge)
				min_charge = charge;
			if (charge>max_charge)
				max_charge = charge;
		}
	}

	for (i=0; i<mzxml_files.size(); i++)
	{
		int j;
		for (j=0; j<mzxml_files[i].single_spectra.size(); j++)
		{
			int charge =mzxml_files[i].single_spectra[j].charge;
			num_spectra[charge]++;
			if (charge<min_charge)
				min_charge = charge;
			if (charge>max_charge)
				max_charge = charge;
		}
	}

	for (i=0; i<dat_files.size(); i++)
	{
		int j;
		for (j=0; j<dat_files[i].single_spectra.size(); j++)
		{
			int charge =dat_files[i].single_spectra[j].charge;
			num_spectra[charge]++;
			if (charge<min_charge)
				min_charge = charge;
			if (charge>max_charge)
				max_charge = charge;
		}
	}

	total_num_spectra=0;
	for (i=0; i<num_spectra.size(); i++)
		total_num_spectra += num_spectra[i];

}



// Replaces `singles` with only those spectra whose m_over_z lies in the
// half-open range [min_m_over_z, max_m_over_z), preserving their order.
template <class SingleVec, class Mass>
static void keep_singles_in_mz_range(SingleVec& singles,
									 Mass min_m_over_z, Mass max_m_over_z)
{
	SingleVec kept;
	for (size_t j=0; j<singles.size(); j++)
	{
		if (singles[j].m_over_z >= min_m_over_z &&
			singles[j].m_over_z <  max_m_over_z)
		{
			kept.push_back(singles[j]);
		}
	}

	singles.swap(kept);	// swap instead of copy-assigning the filtered vector
}


// Inits the FileManager using mass levels (for very large
// collections of spectra). This initialization uses a quick scan.
//
// config        - handed to the per-format readers.
// list_file     - handed to read_paths_into_list() to obtain the paths;
//                 entries starting with '#' and blank entries are skipped.
// min_m_over_z,
// max_m_over_z  - only spectra with min_m_over_z <= m_over_z < max_m_over_z
//                 are kept.
//
// Every recognized mgf / mzXML / dat / ms2 file is appended to the matching
// file vector even if none of its spectra fall in the range; a dta file is
// appended only if its own m_over_z is in range. Prints an error and exits if
// an entry's extension is not recognized. Finishes by calling
// count_num_spectra() to rebuild the per-charge counts.
void FileManager::init_from_list_file(Config *config, const char* list_file,
		mass_t min_m_over_z, mass_t max_m_over_z)
{
	vector<string> list;
	read_paths_into_list(list_file,list);

	num_spectra.resize(20,0);

	for (size_t i=0; i<list.size(); i++)
	{
		const string& path = list[i];
		if (path.empty() || path[0] == '#')
			continue;

		// ignore a single trailing whitespace character when locating the extension
		int last_pos = static_cast<int>(path.length())-1;
		const char tail = path[last_pos];
		if (tail == '\n' || tail == '\r' || tail == '\t' || tail == '\f')
			last_pos--;

		if (last_pos<0)
			continue;	// entry held nothing but that whitespace character

		// extract the candidate extensions only when the entry is long enough,
		// so short entries fall through to the error branch instead of
		// indexing before the start of the string
		const string ext3 = (last_pos>=2 ? path.substr(last_pos-2,3) : string());
		const string ext5 = (last_pos>=4 ? path.substr(last_pos-4,5) : string());

		if (ext3 == "dta")
		{
			DTA_file dta_file;

			dta_file.scan_dta(path,config);

			if (dta_file.m_over_z>=min_m_over_z && dta_file.m_over_z<max_m_over_z)
			{
				// record the file type and its position in dta_files
				dta_file.type = DTA;
				dta_file.file_idx = static_cast<int>(dta_files.size());
				dta_files.push_back(dta_file);
			}
		}
		else if (ext3 == "mgf")
		{
			MGF_file mgf_file;
			mgf_file.mgf_name = path;

			mgf_file.initial_read(config,mgf_files.size(),true);

			// keep only the single spectra of the mgf file record
			// that have a mass in the permitted range
			keep_singles_in_mz_range(mgf_file.single_spectra, min_m_over_z, max_m_over_z);
			mgf_files.push_back(mgf_file);
		}
		else if (ext5 == "mzXML")
		{
			MZXML_file mzxml;
			mzxml.mzxml_name = path;

			mzxml.initial_read(config,mzxml_files.size());

			// keep only the single spectra of the mzXML file record
			// that have a mass in the permitted range
			keep_singles_in_mz_range(mzxml.single_spectra, min_m_over_z, max_m_over_z);
			mzxml_files.push_back(mzxml);
		}
		else if (ext3 == "dat")
		{
			DAT_file dat;
			dat.dat_name = path;

			dat.initial_read(config,dat_files.size());

			// keep only the single spectra of the dat file record
			// that have a mass in the permitted range
			keep_singles_in_mz_range(dat.single_spectra, min_m_over_z, max_m_over_z);
			dat_files.push_back(dat);
		}
		else if (ext3 == "ms2")
		{
			MS2_file ms2_file;
			ms2_file.ms2_name = path;

			ms2_file.initial_read(config,ms2_files.size(),true);

			// keep only the single spectra of the ms2 file record
			// that have a mass in the permitted range
			keep_singles_in_mz_range(ms2_file.single_spectra, min_m_over_z, max_m_over_z);
			ms2_files.push_back(ms2_file);
		}
		else
		{
			cout << "Error: couldn't recognize file type for:: " << list[i] << endl;
			exit(1);
		}
	}

	count_num_spectra();
}


// Inits the FileManager using mass levels (for very large
// collections of spectra). This initialization uses a quick scan
// Only for files whose idx is true in file indicators
void FileManager::init_from_list_file(Config *config, const char* list_file,
		const vector<bool>& file_indicators)
{
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -