📄 stringset.cpp

📁 基于MFC和STL平台的字符串类
💻 CPP
字号:
// StringSet.Cpp: implementation of the CIVStringSet class.
//
// Written 12 June 2002 by Scot T Brennecke
// Thanks to Moishe Halibard and Moshe Rubin for their article,
//    "A Multiple Substring Search Algorithm" in the June 2002
//    edition of C/C++ Users Journal.  This class is based on
//    the algorthim therein described, but extended to return
//    all strings and use MFC classes.


#include "StdAfx.H"
#include "StringSet.H"

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__ ;
#define new DEBUG_NEW
#endif

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
//                       C I V S t r i n g S e t
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// CIVStringSet
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CIVStringSet::CIVStringSet( WORD wInitialWidth /*= 64*/ )
    : m_apnFSM( new DWORD[wInitialWidth][128] ),
      m_nCurDim( wInitialWidth ),
      m_nUsedCols( 0 ),
      m_wMaxUsedState( 0 ),
      m_nCurTextChar( 0 )
{
    // Initialize all FSM entries to zero
    ZeroMemory( m_apnFSM, sizeof(DWORD) * 128 * wInitialWidth ) ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// ~CIVStringSet
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CIVStringSet::~CIVStringSet()
{
    delete [] m_apnFSM ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Add
//    several variations, the first being the most basic
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::Add( LPCTSTR pszWord )
{
    bool bRetVal = false ;

    // Insert the word into the FSM, then add it to the array
    if ( ( pszWord != NULL )
      && InsertWord( pszWord, (WORD)(size() + 1) )
       )
    {
        std::string strWord(pszWord) ;
        push_back( strWord ) ;
        bRetVal = true ;
    }

    return bRetVal ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Add a string
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::Add( const std::string & rstrWord )
{
    bool bRetVal = false ;

    // Insert the word into the FSM, then add it to the array
    if ( InsertWord( rstrWord.c_str(), (WORD)(size() + 1) ) )
    {
        push_back( rstrWord ) ;
        bRetVal = true ;
    }

    return bRetVal ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Add multiple words, delimited with chars from pszDelims
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( LPCTSTR pszWords, LPCTSTR pszDelims )
{
    int nCount = 0 ;

    // Start with the entire string
    std::string strUnparsed( pszWords ),
                strToken ;

    // stop iterating when we find no more words following delimiters
    while ( ! strUnparsed.empty() )
    {
        // Extract the first token
        size_t nTokSize = _tcscspn( strUnparsed.c_str(), pszDelims ) ;
        strToken = strUnparsed.substr( 0, nTokSize ) ;
        if ( ! strToken.empty() )
        {
            if ( Add( strToken ) )  // add it to the FSM and the array
                nCount++ ;
            else
                break ;
        }

        // Now, move beyond the token just extracted and the delimiters
        int nTokenLen = strToken.length() ;
        if ( nTokenLen < strUnparsed.length() )
        {
            int nDelimLen = _tcsspn( &strUnparsed[nTokenLen], pszDelims ) ;
            nTokenLen += nDelimLen ;
            if ( nTokenLen < strUnparsed.length() )
                strUnparsed = strUnparsed.substr( nTokenLen ) ;
            else
                break ;
        }
        else
            break ;
    }

    return nCount ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add nWords words, each \0 term'd, with extra \0 at end
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( LPCTSTR pszzWords, int nWords )
{
    int nCount = 0 ;

    // Start with the first word in the multi-string
    LPCTSTR pszCurWord = pszzWords ;
    if ( pszCurWord != NULL )
    {
        // stop iterating when we find a "word" beginning with null (the extra \0)
        while ( pszCurWord[0] != '\0' )
        {
            if ( Add( pszCurWord ) )  // add it to the FSM and the array
            {
                nCount++ ;

                // Position to the next word, one beyond the null terminator
                size_t nLen = _tcslen( pszCurWord ) ;
                // If this fails, the extra \0 was probably not there, as required
                pszCurWord = &pszzWords[nLen+1] ;
            }
            else
                break ;
        }
    }

#ifdef _DEBUG
    if ( ( nWords != nCount )
      && ( nWords > 0 )
       )
        TRACE("Expected %d words, but found %d!\n", nWords, nCount);
#else
    UNUSED(nWords);
#endif

    return nCount ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add all the elements of an array of strings
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( std::vector<std::string> astrWords )
{
    size_t nCount = astrWords.size() ;
    reserve( size() + nCount ) ;

    for ( UINT nWord = 0 ; nWord < nCount ; nWord++ )
    {
        if ( ! Add( astrWords[nWord] ) )  // add it to the FSM and the array
            break ;
    }

    return nWord ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add all the elements of a list of strings
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( std::list<std::string> lstrWords )
{
    size_t nCount = lstrWords.size() ;
    reserve( size() + nCount ) ;

    // Iterate the string list
    std::list<std::string>::iterator it = lstrWords.begin() ;
    while ( it != lstrWords.end() )
    {
        if ( ! Add( *it++ ) )  // add it to the FSM and the array
            break ;
    }

    return nCount ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// InsertWord
//          put the new word into the FSM
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::InsertWord( LPCTSTR pszWord, WORD wIndex )
{
    bool bRetVal = false ;

    size_t nLen = _tcslen( pszWord ) ;
    if ( ( m_wMaxUsedState < MAXWORD )    // if we've got 65,535 states, something's wrong
      && SetColDim( m_nUsedCols + nLen )  // make sure enough columns are allocated
       )
    {
        WORD wCurState = 0 ;
        for ( UINT nChar = 0 ; nChar < nLen ; nChar++ )
        {
            int nIdxChar = (pszWord[nChar] & 0x7F) ;  // mask out char values above 127
            DWORD dwEntry = m_apnFSM[wCurState][nIdxChar] ;  // the state, and possibly also an index
            WORD wState = LOWORD(dwEntry) ;  // extract only the state portion
            bool bNew = (wState == 0) ;      // no previous words have been here

            if ( bNew )
                wState = ++m_wMaxUsedState ;  // use a new state number

            // special case - last character of substring
            if ( nChar == ( nLen-1 ) )
            {
                // Put both the state number and the word index in the entry
                m_apnFSM[wCurState][nIdxChar] = (DWORD)MAKELONG(wState, wIndex) ;
                break ;
            }
            else if ( bNew ) // if current entry for this char value and state is still zero, add to FSM
                m_apnFSM[wCurState][nIdxChar] = wState ;

            wCurState = wState ;  // prepare for next iteration
        }

        m_nUsedCols += nLen ;
        bRetVal = true ;
    }

    return bRetVal ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// SetColDim
//          set the current width to at least nNewDim columns
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::SetColDim( size_t nNewDim )
{
    bool bRetVal = false ;

    // Determine if we don't yet have enough columns for the newest word 
    if ( m_nCurDim < nNewDim )
    {
        int nNewSize = ((nNewDim + 63) & 0xFFC0) ;  // set to next higher multiple of 64
        if ( nNewSize < MAXWORD )  // we're "limited" to 65,535 columns
        {
            // reallocate the FSM with the increased column width
            DWORD (* apnTemp)[128] = new DWORD[nNewSize][128] ;
            ASSERT((int)m_nUsedCols < nNewSize);
            // Copy the portion already used, and zero out the rest
            CopyMemory( &apnTemp[0], &m_apnFSM[0], sizeof(DWORD) * 128 * m_nUsedCols ) ;
            ZeroMemory( &apnTemp[m_nUsedCols], sizeof(DWORD) * 128 * (nNewSize - m_nUsedCols) ) ;
            delete [] m_apnFSM ; // Get rid of old FSM buffer,
            m_apnFSM = apnTemp ; // and point to new one
            m_nCurDim = (WORD)nNewSize ;

            bRetVal = true ;
        }
    }
    else
        bRetVal = true ;

    return bRetVal ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindFirstIn
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
UINT CIVStringSet::FindFirstIn( std::string strText, int & rnFirst )
{
    // The real work is done by FindNext, but we must initialize the starting
    //    character index and string buffer first
    m_nCurTextChar = 0 ;
    m_strSearch = strText ;

    return FindNext( rnFirst ) ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindNext
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
UINT CIVStringSet::FindNext( int & rnNext )
{
    WORD wCurState = 0 ; // start in state 0
    UINT nIdx ;
    UINT nStartChar = 0xFFFFFFFF ;
    size_t nLen = m_strSearch.length() ;

    // Start with the character following the last one examined
    for ( nIdx = m_nCurTextChar ; nIdx < nLen ; nIdx++ )
    {
        WORD wPrevState = wCurState ;

        // if we are in state 0, save the index of the starting character,
        //   in case we must backtrack or for the return value
        if ( wCurState == 0 )
            nStartChar = nIdx ;

        // follow up the table states
        int nIdxChar = (m_strSearch[(int)nIdx] & 0x7F) ;  // mask out char values above 127
        DWORD dwEntry = m_apnFSM[wCurState][nIdxChar] ;

        // if we reach an entry with a high word (an index), we have found a full match
        //    return the word we found
        if ( dwEntry > MAXWORD )
        {
            int nIndex = HIWORD(dwEntry) - 1 ;
            ASSERT(nIndex < size());
            rnNext = nIndex ;
            m_nCurTextChar = nIdx + 1 ;  // set the index for the next time
            break ;
        }
        else
            wCurState = LOWORD(dwEntry) ;

        // have we followed a false lead?
        // if so, backtrack nIdx to position to continue search, with state reset to zero
        // set nIdx to nStartChar, and it will be incremented as the loop re-enters
        if ( ( wPrevState != 0 )
          && ( wCurState == 0 )
           )
        {
            nIdx = nStartChar ;
            nStartChar = 0xFFFFFFFF ;  // in case we're about to terminate the loop
        }
    }

    // return the index of the start of the word, or -1 if no word found
    return (( nIdx < nLen ) ? nStartChar : 0xFFFFFFFF) ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindAllIn
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
size_t CIVStringSet::FindAllIn( std::string strText, CWordPosPairList & rlstrWords )
{
    rlstrWords.clear() ;

    // Iterate through all words, and put them in the list
    int  nWord = -1 ;
    UINT nPos = FindFirstIn( strText, nWord ) ;
    if ( nPos != 0xFFFFFFFF )
    {
        do
        {
            CWordPosPair posWord( nWord, nPos ) ;
            rlstrWords.push_back( posWord ) ;
            nPos = FindNext( nWord ) ;
        } while ( nPos != 0xFFFFFFFF ) ;
    }

    return rlstrWords.size() ;
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -