📄 qcdat.cpp
字号:
#include "QuickClustering.h"
#include "base64.h"
DAT_FileBuff::~DAT_FileBuff()
{
if (buff && pos>buff)
flush_buff();
if (buff)
delete [] buff;
}
void DAT_FileBuff::init(string& _path, int dat_buff_size)
{
if (! ind_was_initialized)
{
buff = new char[dat_buff_size];
if (! buff)
{
cout << "Error: couldn't allocate memory for DAT file buff!" << endl;
exit(1);
}
}
ind_was_initialized = 1;
path = _path;
max_pos = buff + dat_buff_size - 100;
pos = buff;
ind_first_write = 1;
}
// copies the files to the DAT file
void DAT_FileBuff::add_spec_to_DAT_file(
mass_t m_over_z,
int charge,
int mzxml_file_idx,
int scan_number,
float retention_time,
float precursor_intensity,
int num_peaks,
char *peak_buff)
{
const int spec_bytes = sizeof(mass_t) + 4 * sizeof(int) + 2 * sizeof(float) * num_peaks;
if (num_peaks>5000 || num_peaks<0)
cout << mzxml_file_idx << " " << scan_number << " " << " p: " << num_peaks << endl;
if (mzxml_file_idx<0 || scan_number<0)
{
cout <<"Error: bad file idx or scan number! " << mzxml_file_idx << ", " << scan_number << endl;
exit(1);
}
if (pos + spec_bytes >= max_pos)
flush_buff();
mass_t *m_ptr = (mass_t *)pos;
*m_ptr++ = m_over_z;
int *i_ptr = (int *)m_ptr;
*i_ptr++ = charge;
*i_ptr++ = mzxml_file_idx;
*i_ptr++ = scan_number;
*i_ptr++ = num_peaks;
float *f_ptr = (float *)i_ptr;
*f_ptr++ = retention_time;
*f_ptr++ = precursor_intensity;
pos = (char *)f_ptr;
memcpy(pos,peak_buff,2 * sizeof(float) * num_peaks);
pos += 2 * sizeof(float) * num_peaks;
this->counter++;
}
void DAT_FileBuff::flush_buff()
{
FILE* dat_stream;
if (pos == buff)
return;
if (! ind_was_initialized)
{
cout << "Error: must first initialize the DAT_FileBuff!" << endl;
exit(1);
}
if (ind_first_write)
{
dat_stream=fopen(path.c_str(),"wb");
ind_first_write = 0;
}
else
dat_stream=fopen(path.c_str(),"ab");
if (! dat_stream)
{
cout << "Error: couldn't open DAT file for writing: " << path.c_str() << endl;
exit(1);
}
fwrite(buff,1,pos-buff,dat_stream);
fclose(dat_stream);
pos=buff;
}
void DAT_Converter::init_DAT_Converter(mass_t _max_m_over_z, mass_t _mass_increment, int _dat_buff_size)
{
// name = _name;
// out_dir = _out_dir;
max_m_over_z = _max_m_over_z;
mass_increment = _mass_increment;
dat_buff_size = _dat_buff_size;
max_dat_file_idx = (int)(max_m_over_z / mass_increment) + 1;
dat_buffs.resize(max_dat_file_idx+1);
ind_was_initialized =1;
}
// creates a file with the list of DAT files
void DAT_Converter::create_list_file() const
{
FILE *list_stream;
ostringstream oss;
oss << batch;
string list_path = out_dir + "/" + name + "_" + oss.str() + "_list.txt";
list_stream = fopen(list_path.c_str(),"w");
if (! list_stream)
{
cout << "Error: couldn't open list file for writing: " << list_path.c_str() << endl;
exit(1);
}
int i;
for (i=0; i<dat_buffs.size(); i++)
{
if (dat_buffs[i].buff)
{
fprintf(list_stream,"%s\n",dat_buffs[i].path.c_str());
}
}
fclose(list_stream);
}
int DAT_Converter::convert_single_non_MZXML_file_to_DAT(Config* config, string file,
mass_t min_m_over_z, mass_t max_m_over_z, int file_idx)
{
static QCPeak *peak_buff=NULL;
static float *dat_peak_buff=NULL;
if (! peak_buff)
{
peak_buff = new QCPeak[20000];
dat_peak_buff = new float[40000];
}
if (! peak_buff || ! dat_peak_buff)
{
cout << "Error: couldn't allocate memory for DAT conversion!" << endl;
exit(1);
}
if (! ind_was_initialized)
{
cout << "Error: must initialize DAT_Converter!" << endl;
exit(0);
}
vector<string> list;
list.push_back(file);
BasicSpecReader bsr;
FileManager fm;
FileSet all_spec_fs;
fm.init_from_list(config,list,true,file_idx);
all_spec_fs.select_all_files(fm);
const int total_spectra = all_spec_fs.get_total_spectra();
const vector<SingleSpectrumFile *>& all_ssf = all_spec_fs.get_ssf_pointers();
cout << file_idx << " " << file << " ... " << endl;
int num_spectra_extracted=0;
int i;
for (i=0; i<all_ssf.size(); i++)
{
SingleSpectrumFile *ssf = all_ssf[i];
// no small spectra
if (ssf->num_peaks<5 || ssf->m_over_z<=min_m_over_z || ssf->m_over_z > max_m_over_z)
continue;
int j,pos=0;
int num_spec_peaks = bsr.read_basic_spec(config,fm,ssf,peak_buff,true);
if (num_spec_peaks<5)
continue;
mass_t m_over_z = ssf->m_over_z;
int scan_number=-1, file_idx=-1;
float precursor_intensity=0;
if (ssf->type == DTA)
{
file_idx = 0;
scan_number=-1;
}
else if (ssf->type == MGF)
{
MGF_single *mgf_single = (MGF_single *)ssf;
file_idx = mgf_single->file_idx;
scan_number = mgf_single->idx_in_file;
}
else if (ssf->type == MZXML)
{
MZXML_single *mzxml_single = (MZXML_single *)ssf;
file_idx = mzxml_single->file_idx;
scan_number = mzxml_single->scan_number;
precursor_intensity = mzxml_single->precursor_intensity;
}
else if (ssf->type == DAT)
{
DAT_single *dat_single = (DAT_single *)ssf;
file_idx = dat_single->mzxml_file_idx;
scan_number = dat_single->scan_number;
}
// copy peaks
for (j=0; j<num_spec_peaks; j++)
{
dat_peak_buff[pos++]=(float)peak_buff[j].mass;
dat_peak_buff[pos++]=(float)peak_buff[j].intensity;
}
// add spectrum
int DAT_file_idx = (int)(m_over_z/mass_increment);
if (DAT_file_idx > max_dat_file_idx)
DAT_file_idx = max_dat_file_idx;
if (! dat_buffs[DAT_file_idx].ind_was_initialized)
{
ostringstream os,os_batch;
os << DAT_file_idx;
os_batch << batch;
string path = out_dir + "/" + name + "_" + os_batch.str() + "_" + os.str() + ".dat";
dat_buffs[DAT_file_idx].init(path,dat_buff_size);
}
int charge = ssf->charge;
if (charge<0 || charge>1000)
charge=0;
dat_buffs[DAT_file_idx].add_spec_to_DAT_file(
m_over_z,
charge,
file_idx,
scan_number,ssf->retention_time,
precursor_intensity,
num_spec_peaks,(char *)dat_peak_buff);
num_spectra_extracted++;
}
cout << num_spectra_extracted << " spectra..." << endl;
return num_spectra_extracted;
}
int DAT_Converter::convert_PKL_dir_to_DAT(Config* config, char *file_list, int file_start_idx,
mass_t min_m_over_z, mass_t max_m_over_z)
{
FILE *list_stream=fopen(file_list,"r");
if (! list_stream)
{
cout << "Error: couldn't open pkl dir list for reading: " << file_list << endl;
exit(1);
}
char line_buff[1024];
cout << "Read pkl files (idx path #spectra):" << endl;
int total_extracted=0;
int pkl_dir_idx=0;
while (fgets(line_buff,1024,list_stream))
{
istringstream is(line_buff);
string pkl_dir_path = "";
string tsv_file = "";
is >> pkl_dir_path >> tsv_file;
if (pkl_dir_path.length()>2 && tsv_file.length() >2)
{
FileManager fm;
fm.init_from_single_pkl_dir(config, pkl_dir_path, tsv_file, pkl_dir_idx,
min_m_over_z, max_m_over_z);
FileSet fs;
fs.select_all_files(fm);
const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
cout << pkl_dir_idx << "\t" << pkl_dir_path << "\t" << all_ssf.size() << endl;
static QCPeak *peak_buff=NULL;
static float *dat_peak_buff=NULL;
if (! peak_buff)
{
peak_buff = new QCPeak[20000];
dat_peak_buff = new float[40000];
}
if (! peak_buff || ! dat_peak_buff)
{
cout << "Error: couldn't allocate memory for DAT conversion!" << endl;
exit(1);
}
if (! ind_was_initialized)
{
cout << "Error: must initialize DAT_Converter!" << endl;
exit(0);
}
BasicSpecReader bsr;
int num_spectra_extracted=0;
int i;
for (i=0; i<all_ssf.size(); i++)
{
PKL_single *ssf = (PKL_single *)all_ssf[i];
// no small spectra
if (ssf->num_peaks<5 || ssf->m_over_z<=min_m_over_z || ssf->m_over_z > max_m_over_z)
continue;
int j,pos=0;
int num_spec_peaks = bsr.read_basic_spec(config,fm,ssf,peak_buff,true);
if (num_spec_peaks<5)
continue;
mass_t m_over_z = ssf->m_over_z;
int scan_number=ssf->scan_number;
float precursor_intensity= (ssf->precursor_intensity >0 ? ssf->precursor_intensity : 0);
// copy peaks
for (j=0; j<num_spec_peaks; j++)
{
dat_peak_buff[pos++]=(float)peak_buff[j].mass;
dat_peak_buff[pos++]=(float)peak_buff[j].intensity;
}
// add spectrum
int DAT_file_idx = (int)(m_over_z/mass_increment);
if (DAT_file_idx > max_dat_file_idx)
DAT_file_idx = max_dat_file_idx;
if (! dat_buffs[DAT_file_idx].ind_was_initialized)
{
ostringstream os,os_batch;
os << DAT_file_idx;
os_batch << batch;
string path = out_dir + "/" + name + "_" + os_batch.str() + "_" + os.str() + ".dat";
dat_buffs[DAT_file_idx].init(path,dat_buff_size);
}
int charge = ssf->charge;
if (charge<0 || charge>1000)
charge=0;
dat_buffs[DAT_file_idx].add_spec_to_DAT_file(
m_over_z,
charge,
pkl_dir_idx + file_start_idx,
scan_number,ssf->retention_time,
precursor_intensity,
num_spec_peaks,(char *)dat_peak_buff);
num_spectra_extracted++;
}
cout << "(" << num_spectra_extracted << ")" << endl;
total_extracted += num_spectra_extracted;
pkl_dir_idx++;
}
}
return total_extracted;
}
void DAT_Converter::convert_files_to_DAT_on_the_fly(Config* config, char *file_list,
char * _out_dir, char * _name, int _batch,
mass_t min_m_over_z,
mass_t max_m_over_z,
int file_start_idx,
bool ind_is_pkl_dir)
{
name = _name;
out_dir = _out_dir;
batch = _batch;
int num_spectra_extracted=0;
name = _name;
out_dir = _out_dir;
if (! ind_was_initialized)
{
cout << "Error: must initialize DAT_Converter!" << endl;
exit(0);
}
if (ind_is_pkl_dir)
{
num_spectra_extracted = convert_PKL_dir_to_DAT(config,file_list, file_start_idx, min_m_over_z,max_m_over_z);
}
else
{
vector<string> list;
read_paths_into_list(file_list,list);
cout << endl << endl <<"Extracting spectra and writing dat files for " << list.size() << " Files. " << endl << endl;
int i;
for (i=0; i<list.size(); i++)
{
int file_type = get_file_extension_type(list[i]);
if (file_type == MZXML)
{
num_spectra_extracted += parse_single_MZXML_file(config,list[i],file_start_idx+i,
min_m_over_z,max_m_over_z);
}
else
{
num_spectra_extracted += convert_single_non_MZXML_file_to_DAT(config,list[i],
min_m_over_z,max_m_over_z,file_start_idx+i);
}
}
}
int d;
for (d=0; d<dat_buffs.size(); d++)
{
if (dat_buffs[d].ind_was_initialized && dat_buffs[d].pos > dat_buffs[d].buff)
dat_buffs[d].flush_buff();
}
cout << "Total spectra extracted and converted to DAT: " << num_spectra_extracted << endl;
create_list_file();
}
int DAT_Converter::parse_single_MZXML_file(Config *config,
string& mzxml_name,
int file_idx,
mass_t min_m_over_z,
mass_t max_m_over_z)
{
static char* Buffer = NULL;
static char* DecodedPeakBuffer = NULL;
static float* Peaks = NULL;
static float* FilteredPeaks = NULL;
static int PeakBufferSize = 0;
int Trail;
static char* PrecursorStr;
int FloatIndex;
char* ByteOrderStr;
int ByteOrderLittle = 1;
int BytesToRead;
int BufferStartPos = 0;
int BytesRead;
int BufferEnd = 0;
FILE* MZXMLFile;
int ParseState = 0;
int FilePos = 0;
// allocate
if (! Buffer)
Buffer = (char*)calloc(XML_BUFFER_SIZE + 1, sizeof(char));
MZXMLFile = fopen(mzxml_name.c_str(), "rb");
if (!MZXMLFile)
{
cout << "Error: Can't open MZXML file " << mzxml_name << endl;
exit(1);
}
printf("File idx %d , '%s'...\n",file_idx, mzxml_name.c_str());
int idx_in_file = 0;
int spec_counter =0;
char *scan_start_ptr = NULL;
while (1)
{
char* ScanStr;
char* ScanNumberStr;
int ScanNumber;
char* MSLevelStr;
int MSLevel;
char *PrecursorStr;
mass_t PrecursorMZ;
char *retentionTimeStr;
float retentionTime;
char *precursorIntensityStr;
float precursorIntensity;
char* PeakStr;
char* PeakBuffer;
int BufferPos;
// Read more data, to fill up the buffer:
if ( ! scan_start_ptr ||
( (Buffer + BufferEnd - scan_start_ptr) < XML_BUFFER_HALF_SIZE) )
{
// try shunt half of the buffer
if (scan_start_ptr)
{
if (BufferEnd - XML_BUFFER_HALF_SIZE>0)
{
memmove(Buffer, Buffer + XML_BUFFER_HALF_SIZE, BufferEnd - XML_BUFFER_HALF_SIZE);
BufferEnd -= XML_BUFFER_HALF_SIZE;
scan_start_ptr -= XML_BUFFER_HALF_SIZE;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -