📄 phonemeextractor.cpp
字号:
{
CWordTag *word = outwords.m_Words[ i ];
if ( IsUseable( word ) )
continue;
count++;
}
return count;
}
// Rebases a word's phonemes onto the word's current byte range, preserving each
//  phoneme's relative position inside the word.
// Input  : *word    - word whose m_uiStartByte / m_uiEndByte already hold the new range
//          oldStart - start byte the word had before it was moved
//          oldEnd   - end byte the word had before it was moved
void RepartitionPhonemes( CWordTag *word, unsigned int oldStart, unsigned int oldEnd )
{
	const float previousSpan = ( float )( oldEnd - oldStart );
	const float currentSpan = ( float )( word->m_uiEndByte - word->m_uiStartByte );

	// With an empty old range there is nothing to scale by, so both fractions stay at zero
	//  and every phoneme collapses onto the word's start byte
	const bool canScale = ( previousSpan > 0.0f );

	for ( int index = 0; index < word->m_Phonemes.Size(); index++ )
	{
		CPhonemeTag *phoneme = word->m_Phonemes[ index ];
		Assert( phoneme );

		// Offsets of the phoneme relative to the old word start
		const float startOffset = ( float ) ( phoneme->m_uiStartByte - oldStart );
		const float endOffset = ( float ) ( phoneme->m_uiEndByte - oldStart );

		const float startFraction = canScale ? startOffset / previousSpan : 0.0f;
		const float endFraction = canScale ? endOffset / previousSpan : 0.0f;

		// Same fractions, applied to the new range
		phoneme->m_uiStartByte = word->m_uiStartByte + ( unsigned int ) ( startFraction * currentSpan );
		phoneme->m_uiEndByte = word->m_uiStartByte + ( unsigned int ) ( endFraction * currentSpan );
	}
}
// Spreads words [start, end] (inclusive indices into outwords.m_Words) over the sample
//  range beginning at sampleStart, giving every word the same integer-sized slice of
//  ( sampleEnd - sampleStart ) / wordCount samples.  Each word's phonemes are rebased
//  onto its new range.
void PartitionWords( CSentence& outwords, int start, int end, int sampleStart, int sampleEnd )
{
	const int numWords = end - start + 1;
	Assert( numWords >= 1 );

	// Integer division: any remainder is simply not assigned to a word
	const int sliceSize = ( sampleEnd - sampleStart ) / numWords;

	int cursor = sampleStart;
	for ( int index = start; index <= end; index++, cursor += sliceSize )
	{
		CWordTag *entry = outwords.m_Words[ index ];
		Assert( entry );

		// Remember the previous range so the phonemes can be rescaled against it
		const unsigned int previousStart = entry->m_uiStartByte;
		const unsigned int previousEnd = entry->m_uiEndByte;

		entry->m_uiStartByte = cursor;
		entry->m_uiEndByte = cursor + sliceSize;

		RepartitionPhonemes( entry, previousStart, previousEnd );
	}
}
// Pools the byte ranges of two words and splits the combined span at its midpoint:
//  w1 receives [ start, mid ], w2 receives [ mid, end ].  The phonemes of both words
//  are then rebased onto their new ranges.
void MergeWords( CWordTag *w1, CWordTag *w2 )
{
	// Snapshot the ranges before they are overwritten; RepartitionPhonemes needs them
	const unsigned int firstOldStart = w1->m_uiStartByte;
	const unsigned int firstOldEnd = w1->m_uiEndByte;
	const unsigned int secondOldStart = w2->m_uiStartByte;
	const unsigned int secondOldEnd = w2->m_uiEndByte;

	// Combined span covers the earliest start and the latest end of the two words
	const unsigned int spanStart = min( firstOldStart, secondOldStart );
	const unsigned int spanEnd = max( firstOldEnd, secondOldEnd );
	const unsigned int splitPoint = ( spanStart + spanEnd ) / 2;

	w1->m_uiStartByte = spanStart;
	w1->m_uiEndByte = splitPoint;

	w2->m_uiStartByte = splitPoint;
	w2->m_uiEndByte = spanEnd;

	RepartitionPhonemes( w1, firstOldStart, firstOldEnd );
	RepartitionPhonemes( w2, secondOldStart, secondOldEnd );
}
// Repairs zero-length words.  The word list is scanned in adjacent pairs; the first pair
//  containing a word whose start and end byte are equal is handed to MergeWords, which
//  splits the pair's combined span between the two words, and the scan then restarts from
//  the beginning.  The function returns once a full scan finds no zero-length word, or
//  once a merge fails to change anything (see below).
// Input  : outwords - sentence whose m_Words byte ranges are adjusted in place
void FixupZeroLengthWords( CSentence& outwords )
{
	while ( 1 )
	{
		int i;
		for ( i = 0 ; i < outwords.m_Words.Size() - 1; i++ )
		{
			CWordTag *current = outwords.m_Words[ i ];
			CWordTag *next = outwords.m_Words[ i + 1 ];

			// The byte fields are handled as unsigned int throughout this file, so a
			//  "length <= 0" test can only ever mean start == end
			const bool currentEmpty = ( current->m_uiEndByte == current->m_uiStartByte );
			const bool nextEmpty = ( next->m_uiEndByte == next->m_uiStartByte );
			if ( !currentEmpty && !nextEmpty )
				continue;

			const unsigned int oldCurrentStart = current->m_uiStartByte;
			const unsigned int oldCurrentEnd = current->m_uiEndByte;
			const unsigned int oldNextStart = next->m_uiStartByte;
			const unsigned int oldNextEnd = next->m_uiEndByte;

			MergeWords( current, next );

			const bool unchanged =
				current->m_uiStartByte == oldCurrentStart &&
				current->m_uiEndByte == oldCurrentEnd &&
				next->m_uiStartByte == oldNextStart &&
				next->m_uiEndByte == oldNextEnd;

			// If the merge moved nothing (e.g. two empty words at the same offset, or a
			//  combined span too short to split), restarting the scan would select this
			//  exact pair again and spin forever.  Give up instead of hanging.
			if ( unchanged )
				return;

			// Ranges changed, restart the scan from the first pair
			break;
		}

		// Scan ran to completion without merging anything
		if ( i >= outwords.m_Words.Size() - 1 )
		{
			break;
		}
	}
}
void ComputeMissingByteSpans( int numsamples, CSentence& outwords )
{
int numwords = outwords.m_Words.Size();
// Nothing to do
if ( numwords <= 0 )
return;
int interationcount = 1;
while( 1 )
{
Log( "\nCompute %i\n", interationcount++ );
LogWords( outwords );
int wordNumber;
// Done!
if ( !CountUnuseableWords( outwords ) )
{
FixupZeroLengthWords( outwords );
break;
}
if ( !CountUsableWords( outwords ) )
{
// Evenly space words across full sample time
PartitionWords( outwords, 0, numwords - 1, 0, numsamples );
break;
}
wordNumber = FindFirstUsableWord( outwords );
// Not the first word
if ( wordNumber > 0 )
{
// Repartition all of the unusables and the first one starting at zero over the range
CWordTag *firstUsable = outwords.m_Words[ wordNumber ];
Assert( firstUsable );
if ( firstUsable->m_uiStartByte != 0 )
{
PartitionWords( outwords, 0, wordNumber - 1, 0, firstUsable->m_uiStartByte );
}
else
{
PartitionWords( outwords, 0, wordNumber, 0, firstUsable->m_uiEndByte );
}
// Start over
continue;
}
wordNumber = FindLastUsableWord( outwords );
// Not the last word
if ( wordNumber >= 0 && wordNumber < numwords - 1 )
{
// Repartition all of the unusables and the first one starting at zero over the range
CWordTag *lastUsable = outwords.m_Words[ wordNumber ];
Assert( lastUsable );
if ( lastUsable->m_uiEndByte != (unsigned int)numsamples )
{
PartitionWords( outwords, wordNumber + 1, numwords-1, lastUsable->m_uiEndByte, numsamples );
}
else
{
PartitionWords( outwords, wordNumber, numwords-1, lastUsable->m_uiStartByte, numsamples );
}
// Start over
continue;
}
// If we get here it means that the start and end of the list are okay and we just have to
// iterate across the list and fix things in the middle
int startByte = 0;
int endByte = 0;
for ( int i = 0; i < numwords ; i++ )
{
CWordTag *word = outwords.m_Words[ i ];
if ( IsUseable( word ) )
{
startByte = word->m_uiEndByte;
continue;
}
// Found the start of a chain of 1 or more unusable words
// Find the startbyte of the next usable word and count how many words we check
int wordCount = 1;
for ( int j = i + 1; j < numwords; j++ )
{
CWordTag *next = outwords.m_Words[ j ];
if ( IsUseable( next ) )
{
endByte = next->m_uiStartByte;
break;
}
wordCount++;
}
// Now partition words across the gap and go to start again
PartitionWords( outwords, i, i + wordCount - 1, startByte, endByte );
break;
}
}
}
//-----------------------------------------------------------------------------
// Purpose: Given a wavfile and a list of inwords, determines the word/phonene
// sample counts for the sentce
// Input : *wavfile -
// *inwords -
// *outphonemes{ text.Clear( -
// Output : SR_RESULT
//-----------------------------------------------------------------------------
static SR_RESULT SAPI_ExtractPhonemes(
const char *wavfile,
int numsamples,
void (*pfnPrint)( const char *fmt, ... ),
CSentence& inwords,
CSentence& outwords )
{
LogReset();
USES_CONVERSION;
CSpDynamicString text;
text.Clear();
HKEY hkwipe;
LONG lResult = RegOpenKeyEx( HKEY_CURRENT_USER, "Software\\Microsoft\\Speech\\RecoProfiles", 0, KEY_ALL_ACCESS, &hkwipe );
if ( lResult == ERROR_SUCCESS )
{
RecursiveRegDelKey( hkwipe );
RegCloseKey( hkwipe );
}
if ( strlen( inwords.GetText() ) <= 0 )
{
inwords.SetTextFromWords();
}
// Construct a string from the inwords array
text.Append( T2W( inwords.GetText() ) );
// Assume failure
SR_RESULT result = SR_RESULT_ERROR;
if ( text.Length() > 0 )
{
CSentence sentence;
pfnPrint( "Processing...\r\n" );
// Give it a try
result = ExtractPhonemes( wavfile, text, sentence, pfnPrint );
pfnPrint( "Finished.\r\n" );
// PrintWordsAndPhonemes( sentence, pfnPrint );
// Copy results to outputs
outwords.Reset();
outwords.SetText( inwords.GetText() );
Log( "Starting\n" );
LogWords( inwords );
if ( SR_RESULT_ERROR != result )
{
int i;
Log( "Hypothesized\n" );
LogWords( sentence );
for( i = 0 ; i < sentence.m_Words.Size(); i++ )
{
CWordTag *tag = sentence.m_Words[ i ];
if ( tag )
{
// Skip '...' tag
if ( stricmp( tag->m_pszWord, "..." ) )
{
CWordTag *newTag = new CWordTag( *tag );
outwords.m_Words.AddToTail( newTag );
}
}
}
// Now insert unrecognized/skipped words from original list
//
int frompos = 0, topos = 0;
int iterationcount = 1;
while( 1 )
{
// End of source list
if ( frompos >= inwords.m_Words.Size() )
break;
const CWordTag *fromTag = inwords.m_Words[ frompos ];
// Reached end of destination list, just copy words over from from source list until
// we run out of source words
if ( topos >= outwords.m_Words.Size() )
{
// Just copy words over
CWordTag *newWord = new CWordTag( *fromTag );
// Remove phonemes
while ( newWord->m_Phonemes.Size() > 0 )
{
CPhonemeTag *kill = newWord->m_Phonemes[ 0 ];
newWord->m_Phonemes.Remove( 0 );
delete kill;
}
outwords.m_Words.AddToTail( newWord );
frompos++;
topos++;
continue;
}
// Destination word
const CWordTag *toTag = outwords.m_Words[ topos ];
// Words match, just skip ahead
if ( !stricmp( fromTag->m_pszWord, toTag->m_pszWord ) )
{
frompos++;
topos++;
continue;
}
// The only case we handle is that something in the source wasn't in the destination
// Find the next source word that appears in the destination
int skipAhead = frompos + 1;
bool found = false;
while ( skipAhead < inwords.m_Words.Size() )
{
const CWordTag *sourceWord = inwords.m_Words[ skipAhead ];
if ( !stricmp( sourceWord->m_pszWord, toTag->m_pszWord ) )
{
found = true;
break;
}
skipAhead++;
}
// Uh oh destination has words that are not in source, just skip to next destination word?
if ( !found )
{
topos++;
}
else
{
// Copy words from from source list into destination
//
int skipCount = skipAhead - frompos;
while ( --skipCount>= 0 )
{
const CWordTag *sourceWord = inwords.m_Words[ frompos++ ];
CWordTag *newWord = new CWordTag( *sourceWord );
// Remove phonemes
while ( newWord->m_Phonemes.Size() > 0 )
{
CPhonemeTag *kill = newWord->m_Phonemes[ 0 ];
newWord->m_Phonemes.Remove( 0 );
delete kill;
}
outwords.m_Words.InsertBefore( topos, newWord );
topos++;
}
frompos++;
topos++;
}
}
Log( "\nDone simple check\n" );
LogWords( outwords );
LogPhonemes( outwords );
ComputeMissingByteSpans( numsamples, outwords );
Log( "\nFinal check\n" );
LogWords( outwords );
LogPhonemes( outwords );
}
}
else
{
pfnPrint( "Input sentence is empty!\n" );
}
// Return results
return result;
}
//-----------------------------------------------------------------------------
// Purpose: IPhonemeExtractor implementation that forwards extraction requests
//  to SAPI_ExtractPhonemes; exposed below via EXPOSE_SINGLE_INTERFACE
//-----------------------------------------------------------------------------
class CPhonemeExtractorSAPI : public IPhonemeExtractor
{
public:
	// Reports the API type of this extractor
	virtual PE_APITYPE GetAPIType() const
	{
		return SPEECH_API_SAPI;
	}

	// Display name, used for menus, etc
	virtual const char *GetName() const
	{
		return "MS SAPI 5.1";
	}

	// Passes every argument straight through to SAPI_ExtractPhonemes and returns its result
	SR_RESULT Extract(
		const char *wavfile,
		int numsamples,
		void (*pfnPrint)( const char *fmt, ... ),
		CSentence& inwords,
		CSentence& outwords )
	{
		const SR_RESULT outcome = SAPI_ExtractPhonemes( wavfile, numsamples, pfnPrint, inwords, outwords );
		return outcome;
	}
};

EXPOSE_SINGLE_INTERFACE( CPhonemeExtractorSAPI, IPhonemeExtractor, VPHONEME_EXTRACTOR_INTERFACE );
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -