📄 qcdat.cpp
字号:
// cout << "MOVED!" << endl;
}
}
else
scan_start_ptr = Buffer;
BytesToRead = XML_BUFFER_SIZE - BufferEnd;
BytesRead = fread(Buffer + BufferEnd, sizeof(char), BytesToRead, MZXMLFile);
if (BytesRead<5)
break;
BufferEnd += BytesRead;
Buffer[BufferEnd] = '\0';
FilePos += BytesRead;
}
// Look for a new <scan tag opening:
// this scan cannot be done with strstr since there might be NULL termination
const char *last_pos = Buffer + BufferEnd - 5;
char *pos = scan_start_ptr;
while (++pos<last_pos)
{
if (*pos != '<')
continue;
if (*(pos+1)=='s' && *(pos+2)=='c' && *(pos+3)=='a' && *(pos+4)=='n')
break;
}
ScanStr = (pos<last_pos) ? pos : NULL;
if (ScanStr)
{
// if this is the case, read over more data to avoid the case where
// the spectrum's record is not all in the buffer
if (scan_start_ptr - Buffer > XML_BUFFER_HALF_SIZE)
{
scan_start_ptr = ScanStr-2;
continue;
}
BufferPos = ScanStr - Buffer;
}
else
{
BufferPos = 0;
}
if (!ScanStr )
{
scan_start_ptr = Buffer + BufferEnd-5;
continue;
}
ScanNumberStr = strstr(ScanStr, "num=");
if (!ScanNumberStr)
{
// printf("** Warning: mzXML parser encountered a scan with no scan number! File %s Pos %d\n",
// mzxml_name.c_str(), FilePos + BufferPos - BufferEnd);
ScanNumber = -1;
}
else
{
ScanNumber = ParseIntFromXML(ScanNumberStr);
scan_start_ptr = ScanNumberStr;
}
retentionTimeStr = strstr(ScanStr,"retentionTime=\"PT");
if (! retentionTimeStr)
{
retentionTime = -1;
}
else
{
retentionTime = ParseMassFromXML(retentionTimeStr);
}
char *PeakCountStr = strstr(ScanStr, "peaksCount=\"");
if (!PeakCountStr)
{
cout << "Warning: bad parsing peaks in mzxml. " << endl;
cout << "Scan: " << ScanNumber << " Pos: " << FilePos << endl;
scan_start_ptr += 50;
continue;
}
int PeakCount = ParseIntFromXML(PeakCountStr);
MSLevelStr = strstr(ScanStr, "msLevel=");
if (!MSLevelStr)
{
printf("** Warning: mzXML parser encountered a scan with no scan level! File %s Pos %d\n",
mzxml_name.c_str(), FilePos + BufferPos - BufferEnd);
scan_start_ptr += 50;
continue;
}
else
{
MSLevel = ParseIntFromXML(MSLevelStr);
}
if (MSLevel<=1)
{
scan_start_ptr += 50;
continue;
}
precursorIntensityStr = strstr(ScanStr,"precursorIntensity=");
if (! precursorIntensityStr)
{
cout << "Warning: no precursor intensity found for scan " << ScanNumber << " pos: " << BufferPos << endl;
scan_start_ptr += 50;
continue;
}
else
{
precursorIntensity = ParseMassFromXML(precursorIntensityStr);
}
PrecursorStr = strstr(ScanStr, "<precursorMz");
if (PrecursorStr)
{
PrecursorStr = strstr(PrecursorStr, ">");
PrecursorMZ = ParseMassFromXML(PrecursorStr);
}
if (!PrecursorStr && MSLevel > 1)
{
printf("Warning: mzXML parser encountered a scan with no m/z: File %s Pos %d\n",
mzxml_name.c_str(), FilePos + BufferPos - BufferEnd);
scan_start_ptr += 50;
continue;
}
// check that this is a good spectrum to output
if (MSLevel>1 && PeakCount>2 && PrecursorMZ>= min_m_over_z && PrecursorMZ <= max_m_over_z)
{
// read peaks
PeakStr = strstr(PrecursorStr, "<peaks");
if (PeakStr)
{
// Get byte order:
ByteOrderStr = strstr(PeakStr, "byteOrder=\"");
if (ByteOrderStr)
{
ByteOrderStr += 11;
if (!strncmp(ByteOrderStr, "network", 7))
{
ByteOrderLittle = 0;
}
if (!strncmp(ByteOrderStr, "big", 3))
{
ByteOrderLittle = 0;
}
if (!strncmp(ByteOrderStr, "little", 6))
{
ByteOrderLittle = 1;
}
}
PeakStr = strstr(PeakStr, ">");
}
if (!PeakStr)
{
cout << "Warning couldn't find peaks tag for scan: " << ScanNumber << " Buffer pos: " << BufferPos << " skipping..." << endl;
scan_start_ptr += 50;
continue;
}
PeakStr++;
PeakBuffer = PeakStr;
if (PeakBufferSize < PeakCount)
{
if (DecodedPeakBuffer)
{
char *dbf = DecodedPeakBuffer;
free(DecodedPeakBuffer);
DecodedPeakBuffer = NULL;
free(Peaks);
Peaks = NULL;
free(FilteredPeaks);
FilteredPeaks=NULL;
}
PeakBufferSize = (int)(PeakCount*1.5);
DecodedPeakBuffer = (char*)calloc(PeakBufferSize * 8 + 8, 1);
Peaks = (float*)calloc(PeakBufferSize * 2, sizeof(float));
FilteredPeaks = (float*)calloc(PeakBufferSize * 2, sizeof(float));
}
Trail = (PeakCount % 3);
if (!(PeakCount % 3))
{
PeakBuffer[PeakCount * 32/3] = '\0';
}
else
{
PeakBuffer[(PeakCount * 32/3) + Trail + 1] = '\0';
}
b64_decode_mio( DecodedPeakBuffer, PeakBuffer);
for (FloatIndex = 0; FloatIndex < (2 * PeakCount); FloatIndex++)
{
#ifdef BYTEORDER_LITTLE_ENDIAN
if (!ByteOrderLittle)
{
char ByteSwap = DecodedPeakBuffer[FloatIndex*4];
DecodedPeakBuffer[FloatIndex*4] = DecodedPeakBuffer[FloatIndex*4 + 3];
DecodedPeakBuffer[FloatIndex*4 + 3] = ByteSwap;
ByteSwap = DecodedPeakBuffer[FloatIndex*4 + 1];
DecodedPeakBuffer[FloatIndex*4 + 1] = DecodedPeakBuffer[FloatIndex*4 + 2];
DecodedPeakBuffer[FloatIndex*4 + 2] = ByteSwap;
}
memcpy(Peaks + FloatIndex, DecodedPeakBuffer + FloatIndex * 4, 4);
#else
if (ByteOrderLittle)
{
char ByteSwap = DecodedPeakBuffer[FloatIndex*4];
DecodedPeakBuffer[FloatIndex*4] = DecodedPeakBuffer[FloatIndex*4 + 3];
DecodedPeakBuffer[FloatIndex*4 + 3] = ByteSwap;
ByteSwap = DecodedPeakBuffer[FloatIndex*4 + 1];
DecodedPeakBuffer[FloatIndex*4 + 1] = DecodedPeakBuffer[FloatIndex*4 + 2];
DecodedPeakBuffer[FloatIndex*4 + 2] = ByteSwap;
}
memcpy(Peaks + FloatIndex, DecodedPeakBuffer + FloatIndex * 4, 4);
#endif
}
// add spectrum
int DAT_file_idx = (int)(PrecursorMZ/mass_increment);
if (DAT_file_idx > max_dat_file_idx)
DAT_file_idx = max_dat_file_idx;
if (! dat_buffs[DAT_file_idx].ind_was_initialized)
{
ostringstream os, os_batch;
os << DAT_file_idx;
os_batch << batch;
string path = out_dir + "/" + name + "_" + os_batch.str() + "_" + os.str() + ".dat";
dat_buffs[DAT_file_idx].init(path,dat_buff_size);
}
// join and filter peaks
int new_num_peaks = join_and_filter_peak_list(config,PrecursorMZ,Peaks,
PeakCount,FilteredPeaks);
if (new_num_peaks>3)
{
dat_buffs[DAT_file_idx].add_spec_to_DAT_file(PrecursorMZ, 0, file_idx, ScanNumber, retentionTime,
precursorIntensity, new_num_peaks, (char *)FilteredPeaks);
spec_counter++;
}
scan_start_ptr = PeakStr + 8 * PeakCount;
}
else
scan_start_ptr = ScanStr +50;
}
fclose(MZXMLFile);
cout << spec_counter << " spectra..." << endl << endl;
return spec_counter;
}
void create_annotated_mgf_from_dat(Config *config,
char *dat_list,
char *mzxml_list,
char *anns_file,
char *output_file)
{
vector<string> list, mzxml_names;
read_paths_into_list(dat_list,list);
vector< vector<int> > annotation_idxs;
vector<mzXML_annotation> annotations;
read_mzXML_annotations(mzxml_list,anns_file, annotation_idxs, annotations);
FileManager fm;
fm.init_from_list_file_and_add_annotations(config,dat_list,annotation_idxs,annotations,true);
FileSet fs;
fs.select_all_files(fm);
const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
BasicSpecReader bsr;
int i;
ofstream mgf_stream(output_file);
QCPeak peaks[5000];
for (i=0; i<all_ssf.size(); i++)
{
int num_spec_peaks = bsr.read_basic_spec(config,fm,all_ssf[i],peaks);
BasicSpectrum bs;
bs.num_peaks = num_spec_peaks;
bs.peaks = peaks;
bs.ssf = all_ssf[i];
bs.ssf->charge = 2;
DAT_single *dat_single = (DAT_single *)all_ssf[i];
char single_name[256];
sprintf(single_name,"spec_%d_%d",dat_single->mzxml_file_idx,dat_single->scan_number);
bs.ssf->single_name = single_name;
bs.output_to_mgf(mgf_stream,config);
}
mgf_stream.close();
}
/*
void create_annotated_mgf_from_mgf(Config *config,
char *mgf_list,
char *anns_file,
char *output_file)
{
vector<string> list;
cout << "MGF: " << mgf_list << endl;
cout << "ANN: " << anns_file << endl;
cout << "OUT: " << output_file << endl;
read_paths_into_list(mgf_list,list);
cout << "working on " << list.size() << " paths" << endl;
vector< vector<int> > annotation_idxs;
vector<mzXML_annotation> annotations;
read_mzXML_annotations_limited(mgf_list,anns_file, annotation_idxs, annotations);
cout << "Annotations: " << annotations.size() << endl;
FileManager fm;
fm.init_from_list_file_and_add_annotations(config,mgf_list,annotation_idxs,annotations,true);
FileSet fs;
fs.select_all_files(fm);
const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
BasicSpecReader bsr;
int i;
ofstream mgf_stream(output_file);
QCPeak peaks[5000];
for (i=0; i<all_ssf.size(); i++)
{
int num_spec_peaks = bsr.read_basic_spec(config,fm,all_ssf[i],peaks,false,true);
BasicSpectrum bs;
bs.num_peaks = num_spec_peaks;
bs.peaks = peaks;
bs.ssf = all_ssf[i];
bs.ssf->charge = 2;
MGF_single *mgf_single = (MGF_single *)all_ssf[i];
char single_name[256];
sprintf(single_name,"spec_%d_%d",mgf_single->file_idx,mgf_single->idx_in_file);
bs.ssf->single_name = single_name;
bs.output_to_mgf(mgf_stream,config);
}
mgf_stream.close();
}
*/
int parse_single_MZXML_file_print_peaks(Config *config,
string& mzxml_name,
int the_scan)
{
static char* Buffer = NULL;
static char* DecodedPeakBuffer = NULL;
static float* Peaks = NULL;
static float* FilteredPeaks = NULL;
static int PeakBufferSize = 0;
int Trail;
static char* PrecursorStr;
int FloatIndex;
char* ByteOrderStr;
int ByteOrderLittle = 1;
int BytesToRead;
int BufferStartPos = 0;
int BytesRead;
int BufferEnd = 0;
FILE* MZXMLFile;
int ParseState = 0;
int FilePos = 0;
// allocate
if (! Buffer)
Buffer = (char*)calloc(XML_BUFFER_SIZE + 1, sizeof(char));
MZXMLFile = fopen(mzxml_name.c_str(), "rb");
if (!MZXMLFile)
{
cout << "Error: Can't open MZXML file " << mzxml_name << endl;
exit(1);
}
int idx_in_file = 0;
int spec_counter =0;
char *scan_start_ptr = NULL;
while (1)
{
char* ScanStr;
char* ScanNumberStr;
int ScanNumber;
char* MSLevelStr;
int MSLevel;
char *PrecursorStr;
mass_t PrecursorMZ;
char *retentionTimeStr;
float retentionTime;
char *precursorIntensityStr;
float precursorIntensity;
char* PeakStr;
char* PeakBuffer;
int BufferPos;
// Read more data, to fill up the buffer:
if ( ! scan_start_ptr ||
( (Buffer + BufferEnd - scan_start_ptr) < XML_BUFFER_HALF_SIZE) )
{
// try shunt half of the buffer
if (scan_start_ptr)
{
if (BufferEnd - XML_BUFFER_HALF_SIZE>0)
{
memmove(Buffer, Buffer + XML_BUFFER_HALF_SIZE, BufferEnd - XML_BUFFER_HALF_SIZE);
BufferEnd -= XML_BUFFER_HALF_SIZE;
scan_start_ptr -= XML_BUFFER_HALF_SIZE;
// cout << "MOVED!" << endl;
}
}
else
scan_start_ptr = Buffer;
BytesToRead = XML_BUFFER_SIZE - BufferEnd;
BytesRead = fread(Buffer + BufferEnd, sizeof(char), BytesToRead, MZXMLFile);
if (BytesRead<5)
break;
BufferEnd += BytesRead;
Buffer[BufferEnd] = '\0';
// remove all '\0' from buffer (these cause parsing errors.
// replace them with ' '
// const char *end_ptr = Buffer + BufferEnd - 2;
// for (char *ptr=Buffer; ptr<end_ptr; ptr++)
// if (*ptr == '\0')
// *ptr = ' ';
FilePos += BytesRead;
}
// Look for a new <scan tag opening:
// this scan cannot be done with strstr since there might be NULL termination
const char *last_pos = Buffer + BufferEnd - 5;
char *pos = scan_start_ptr;
while (++pos<last_pos)
{
if (*pos != '<')
continue;
if (*(pos+1)=='s' && *(pos+2)=='c' && *(pos+3)=='a' && *(pos+4)=='n')
break;
}
ScanStr = (pos<last_pos) ? pos : NULL;
if (ScanStr)
{
// if this is the case, read over more data to avoid the case where
// the spectrum's record is not all in the buffer
if (scan_start_ptr - Buffer > XML_BUFFER_HALF_SIZE)
{
scan_start_ptr = ScanStr-2;
continue;
}
BufferPos = ScanStr - Buffer;
}
else
{
BufferPos = 0;
}
if (!ScanStr )
{
scan_start_ptr = Buffer + BufferEnd-5;
continue;
}
ScanNumberStr = strstr(ScanStr, "num=");
if (!ScanNumberStr)
{
// printf("** Warning: mzXML parser encountered a scan with no scan number! File %s Pos %d\n",
// mzxml_name.c_str(), FilePos + BufferPos - BufferEnd);
ScanNumber = -1;
}
else
{
ScanNumber = ParseIntFromXML(ScanNumberStr);
scan_start_ptr = ScanNumberStr;
}
retentionTimeStr = strstr(ScanStr,"retentionTime=\"PT");
if (! retentionTimeStr)
{
// printf("Error: mzXML parser encountered a scan with no retnetion time: File %s Pos %d\n",
// mzxml_name.c_str(), FilePos + BufferPos - BufferEnd);
// cout <<"SCANSTR:"<<endl << ScanStr << endl;
// exit(1);
retentionTime = -1;
}
else
{
retentionTime = ParseMassFromXML(retentionTimeStr);
}
char *PeakCountStr = strstr(ScanStr, "peaksCount=\"");
if (!PeakCountStr)
{
cout << "Warning: bad parsing peaks in mzxml. " << endl;
cout << "Scan: " << ScanNumber << " Pos: " << FilePos << endl;
scan_start_ptr += 50;
continue;
// exit(1);
}
int PeakCount = ParseIntFromXML(PeakCountStr);
MSLevelStr = strstr(ScanStr, "msLevel=");
if (!MSLevelStr)
{
printf("** Warning: mzXML parser encountered a scan with no scan level! File %s Pos %d\n",
mzxml_name.c_str(), FilePos + BufferPos - BufferEnd);
scan_start_ptr += 50;
continue;
}
else
{
MSLevel = ParseIntFromXML(MSLevelStr);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -