📄 regexp.cpp
字号:
/*
 * ===========================================================================
 * PRODUCTION $Log: regexp.cpp,v $
 * PRODUCTION Revision 1000.2  2004/06/01 19:40:23  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.7
 * PRODUCTION
 * ===========================================================================
 */

/*  $Id: regexp.cpp,v 1000.2 2004/06/01 19:40:23 gouriano Exp $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author: Clifford Clausen
 * File Description:
 *         C++ wrappers for Perl Compatible Regular Expression (pcre) library
 *
 * ===========================================================================
 */

#include <ncbi_pch.hpp>
#include <corelib/ncbi_limits.h>
#include <util/regexp.hpp>
#include <memory>
#include <errno.h>
#include <stdlib.h>


BEGIN_NCBI_SCOPE


//////////////////////////////////////////////////////////////////////////////
//
//  CRegexpException
//
//  Exception thrown by CRegexp when a pattern cannot be compiled.
//

class CRegexpException : public CException
{
public:
    enum EErrCode {
        eCompile    // pcre_compile() rejected the pattern
    };

    // Return the symbolic name of the current error code.
    virtual const char* GetErrCodeString(void) const
    {
        switch ( GetErrCode() ) {
        case eCompile: return "eCompile";
        default:       return CException::GetErrCodeString();
        }
    }

    NCBI_EXCEPTION_DEFAULT(CRegexpException,CException);
};


//////////////////////////////////////////////////////////////////////////////
//
//  CRegexp
//

// Compile "pattern" with the given PCRE compile flags.
// Throws CRegexpException(eCompile) if pcre_compile() returns NULL; the
// message contains the pattern and the error text reported by PCRE.
CRegexp::CRegexp(const string& pattern, TCompile flags)
    : m_NumFound(0)
{
    const char* err;
    int         err_offset;
    m_PReg = pcre_compile(pattern.c_str(), flags, &err, &err_offset, NULL);
    if (m_PReg == NULL) {
        NCBI_THROW(CRegexpException, eCompile,
                   "Compilation of the pattern '" + pattern +
                   "' failed: " + err);
    }
}


// Release the compiled pattern through PCRE's deallocation hook.
CRegexp::~CRegexp()
{
    (*pcre_free)(m_PReg);
}


// Replace the currently compiled pattern with a new one.
// The old compiled pattern is released first.  If compilation fails,
// m_PReg is left NULL (the value returned by pcre_compile()) and
// CRegexpException(eCompile) is thrown.
void CRegexp::Set(const string& pattern, TCompile flags)
{
    if (m_PReg != NULL) {
        (*pcre_free)(m_PReg);
    }
    const char* err;
    int         err_offset;
    m_PReg = pcre_compile(pattern.c_str(), flags, &err, &err_offset, NULL);
    if (m_PReg == NULL) {
        NCBI_THROW(CRegexpException, eCompile,
                   "Compilation of the pattern '" + pattern +
                   "' failed: " + err);
    }
}


// Return the text of (sub)pattern number "idx" from the last GetMatch()
// call, taken out of "str" (which is expected to be the string that was
// matched).  Index 0 is the whole match.
// Returns an empty string if the last match failed, if "idx" is not less
// than the number of (sub)patterns found, or if that subpattern did not
// participate in the match (PCRE reports such offsets as -1).
string CRegexp::GetSub(const string& str, size_t idx) const
{
    // Validate the index BEFORE touching m_Results: reading the offsets
    // first would index past the end of the array for a large "idx".
    // m_NumFound is <= 0 when the last pcre_exec() failed or found nothing.
    if (m_NumFound <= 0  ||  idx >= (size_t)m_NumFound) {
        return kEmptyStr;
    }
    int start = m_Results[2 * idx];
    int end   = m_Results[2 * idx + 1];
    if (start == -1  ||  end == -1) {
        return kEmptyStr;
    }
    return str.substr(start, end - start);
}


// Run the compiled pattern against "str", starting at byte "offset", and
// store the result count in m_NumFound and the offsets in m_Results.
// If "noreturn" is true an empty string is returned and the caller is
// expected to inspect the results itself; otherwise the text of
// (sub)pattern "idx" is returned (see GetSub()).
string CRegexp::GetMatch(
    const string& str,
    TSeqPos       offset,
    size_t        idx,
    TMatch        flags,
    bool          noreturn)
{
    m_NumFound = pcre_exec(m_PReg, NULL, str.c_str(), (int)str.length(),
                           (int)offset, flags, m_Results,
                           (int)(kRegexpMaxSubPatterns +1) * 3);
    if ( noreturn ) {
        return kEmptyStr;
    } else {
        return GetSub(str, idx);
    }
}


//////////////////////////////////////////////////////////////////////////////
//
//  CRegexpUtil
//

// Wrap "str" for in-place editing.  The content starts undivided, with no
// address range set and "\n" as the default delimiter.
CRegexpUtil::CRegexpUtil(const string& str)
    : m_Content(str), m_IsDivided(false),
      m_RangeStart(kEmptyStr), m_RangeEnd(kEmptyStr), m_Delimiter("\n")
{
    return;
}


// Remember the start/end address patterns used by ReplaceRange() and split
// the content into parts using "delimiter".
void CRegexpUtil::SetRange(
    const string& addr_start,
    const string& addr_end,
    const string& delimiter)
{
    m_RangeStart = addr_start;
    m_RangeEnd   = addr_end;
    x_Divide(delimiter);
}


// Replace occurrences of the regular expression "search" in the whole
// content with "replace".
//
// In "replace", "$<number>" and "{$<number>}" are substituted with the text
// of the corresponding subpattern of the current match.  A number that is
// not in the range [1, number of found subpatterns) is substituted with an
// empty string.  A "$" that is not followed by a number is left as is.
//
// At most "max_replace" replacements are made; 0 means no limit.
// Returns the number of replacements made (0 if "search" is empty).
// Throws CRegexpException if "search" cannot be compiled.
size_t CRegexpUtil::Replace(
    const string&     search,
    const string&     replace,
    CRegexp::TCompile compile_flags,
    CRegexp::TMatch   match_flags,
    size_t            max_replace)
{
    if ( search.empty() ) {
        return 0;
    }
    size_t n_replace = 0;

    // Make sure that the string is not divided.
    x_Join();

    // Compile regular expression.
    CRegexp re(search, compile_flags);
    size_t  start_pos = 0;

    for (size_t count = 0; !(max_replace && count >= max_replace); count++) {

        // Match pattern.  Pass the string itself: going through c_str()
        // would build a temporary copy of the whole content on every
        // iteration and cut it at the first embedded NUL.
        re.GetMatch(m_Content, (int)start_pos, 0, match_flags, true);
        int num_found = re.NumFound();
        if (num_found <= 0) {
            break;
        }

        // Substitute all subpatterns "$<digit>" to values in the "replace"
        // string.
        const int* result;
        string     x_replace = replace;
        size_t     pos = 0;

        for (;;) {
            // Find "$"
            pos = x_replace.find("$", pos);
            if (pos == NPOS) {
                break;
            }
            // Try to convert string after the "$" to number
            errno = 0;
            const char* startptr = x_replace.c_str() + pos + 1;
            char*       endptr   = 0;
            long        value    = strtol(startptr, &endptr, 10);

            if ( errno  ||  endptr == startptr  ||  !endptr  ||
                 value < kMin_Int  ||  value > kMax_Int) {
                // Format error, skip single "$".
                pos++;
                continue;
            }
            int n = (int)value;

            // Get subpattern value
            string subpattern;
            if ( n > 0  &&  n < num_found ) {
                result = re.GetResults(n);
                if (result[0] >= 0  &&  result[1] >= 0) {
                    subpattern = m_Content.substr(result[0],
                                                  result[1] - result[0]);
                }
            }

            // Check braces {$...}
            size_t sp_start = pos;
            size_t sp_end   = endptr - x_replace.c_str();
            if ( sp_start > 0  &&  x_replace[sp_start-1] == '{') {
                sp_start--;
                if ( sp_end <  x_replace.length()  &&
                     x_replace[sp_end] == '}') {
                    sp_end++;
                } else {
                    // Format error -- missed closed brace.
                    sp_start++;
                }
            }
            // Replace $n with subpattern value.
            x_replace.replace(sp_start, sp_end - sp_start, subpattern);

            // Resume right after the inserted text.  The insertion point is
            // sp_start, which is pos-1 for the "{$n}" form; continuing from
            // pos would skip one character and miss an adjacent "$".
            pos = sp_start + subpattern.length();
        }

        // Replace pattern with "x_replace".
        result = re.GetResults(0);
        const size_t match_start = result[0];
        const size_t match_len   = result[1] - result[0];
        m_Content.replace(match_start, match_len, x_replace);
        n_replace++;
        start_pos = match_start + x_replace.length();

        // A zero-length match consumes nothing, so searching again from the
        // same place would match the same empty string forever.  Step over
        // one character of the source text in that case.
        if (match_len == 0) {
            start_pos++;
        }
        // Never hand PCRE a start offset beyond the end of the subject.
        if (start_pos > m_Content.length()) {
            break;
        }
    }
    return n_replace;
}


// Apply Replace() part by part (the content is split by the current
// delimiter), restricted by the address range set with SetRange().
//
// A part is "inside" the range from the part matching m_RangeStart up to
// and including the part matching m_RangeEnd.  If m_RangeStart is empty
// every part is inside; if m_RangeEnd is empty the range consists of the
// single part matching m_RangeStart.  "process_inside" selects whether the
// parts inside (eInside) or outside (eOutside) the range are processed.
//
// "max_replace" is passed to Replace() for every processed part, i.e. it
// limits replacements per part.  Returns the total number of replacements.
size_t CRegexpUtil::ReplaceRange(
    const string&       search,
    const string&       replace,
    CRegexp::TCompile   compile_flags,
    CRegexp::TMatch     match_flags,
    CRegexpUtil::ERange process_inside,
    size_t              max_replace
    )
{
    if ( search.empty() ) {
        return 0;
    }
    size_t n_replace = 0;

    // Split source string to parts by delimiter
    x_Divide();

    // Flag which denote that current line is inside "range"
    bool inside = m_RangeStart.empty();

    NON_CONST_ITERATE (list<string>, i, m_ContentList) {
        // Get new line.  A copy is kept because the range-end check below
        // must see the text as it was before the replacement.
        string line = *i;

        // Check beginning of block [addr_re_start:addr_re_end]
        if ( !inside  &&  !m_RangeStart.empty() ) {
            CRegexp re(m_RangeStart);
            re.GetMatch(line, 0, 0, 0, true);
            inside = (re.NumFound() > 0);
        } else {
            inside = true;
        }

        // Process current line
        if ( (inside   &&  process_inside == eInside)  ||
             (!inside  &&  process_inside == eOutside) ) {
            CRegexpUtil re(line);
            n_replace += re.Replace(search, replace,
                                    compile_flags, match_flags, max_replace);
            *i = re;
        }

        // Check ending of block [addr_re_start:addr_re_end]
        if ( inside  &&  !m_RangeEnd.empty() ) {
            // Two addresses
            CRegexp re(m_RangeEnd);
            re.GetMatch(line, 0, 0, 0, true);
            inside = (re.NumFound() <= 0);
        } else {
            // One address -- process one current string only
            inside = false;
        }
    }
    return n_replace;
}


// Split m_Content into m_ContentList using "delimiter" (or the previously
// saved delimiter if "delimiter" is empty).  Does nothing if the content is
// already divided by the same delimiter; if it is divided by a different
// one, it is joined first and then split again.
void CRegexpUtil::x_Divide(const string& delimiter)
{
    string x_delimiter = delimiter.empty() ? m_Delimiter : delimiter;
    if ( m_IsDivided  ) {
        if ( x_delimiter == m_Delimiter ) {
            return;
        }
        x_Join();
    }
    m_ContentList.clear();

    // Split source string to parts by delimiter
    size_t pos;
    size_t start_pos = 0;
    for (;;) {
        pos = m_Content.find(x_delimiter, start_pos);
        if (pos == NPOS) {
            // The tail after the last delimiter (possibly empty).
            m_ContentList.push_back(m_Content.substr(start_pos));
            break;
        } else {
            m_ContentList.push_back(m_Content.substr(start_pos,
                                                     pos - start_pos));
            start_pos = pos + x_delimiter.length();
        }
    }
    m_IsDivided = true;
    // Save delimiter for consecutive joining
    m_Delimiter = x_delimiter;
}


// Re-assemble m_Content from m_ContentList using the saved delimiter.
// Does nothing if the content is not currently divided.
void CRegexpUtil::x_Join(void)
{
    if ( m_IsDivided ) {
        m_Content = NStr::Join(m_ContentList, m_Delimiter);
        m_IsDivided = false;
    }
}


END_NCBI_SCOPE


/*
 * ===========================================================================
 * $Log: regexp.cpp,v $
 * Revision 1000.2  2004/06/01 19:40:23  gouriano
 * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.7
 *
 * Revision 1.7  2004/05/17 21:06:02  gorelenk
 * Added include of PCH ncbi_pch.hpp
 *
 * Revision 1.6  2003/11/07 17:16:23  ivanov
 * Fixed warnings on 64-bit Workshop compiler
 *
 * Revision 1.5  2003/11/07 13:39:56  ivanov
 * Fixed lines wrapped at 79th columns
 *
 * Revision 1.4  2003/11/06 16:13:04  ivanov
 * Added CRegexpUtil class. Some formal code rearrangement.
 *
 * Revision 1.3  2003/07/16 19:13:50  clausen
 * Added TCompile and TMatch
 *
 * Revision 1.2  2003/06/20 18:26:37  clausen
 * Switched to native regexp interface
 *
 * Revision 1.1  2003/06/03 14:46:23  clausen
 * Initial version
 *
 * ===========================================================================
 */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -