📄 phonemeextractor.cpp
字号:
// extracephonemes.cpp : Defines the entry point for the console application.
//
#define PROTECTED_THINGS_DISABLE
#include <stdio.h>
#include <windows.h>
#include <tchar.h>
#include "sphelper.h"
#include "spddkhlp.h"
// ATL Header Files
#include <atlbase.h>
// Face poser and util includes
#include "utlvector.h"
#include "PhonemeExtractor.h"
#include "PhonemeConverter.h"
#include "sentence.h"
#include "tier0/dbg.h"
// Extract phoneme grammar id
#define EP_GRAM_ID 101
// Base id of the dynamic sentence rule set; AddWordRule() numbers each word
// rule as DYN_SENTENCERULE + index + 1
#define DYN_SENTENCERULE 102
// # of milliseconds to allow for processing before timeout
#define SR_WAVTIMEOUT 4000
// Weight argument passed to the AddWordTransition() calls below for
// rule to rule word/rule transitions
#define CONFIDENCE_WEIGHT 0.0f
// Uncomment to enable logging; while LOGGING is undefined, "#if LOGGING"
// evaluates to 0 and the guarded logging code is compiled out
//#define LOGGING 1
// File that LogReset() opens for writing when LOGGING is enabled
#define LOGFILE "c:\\fp.log"
// Empties the log file by opening it for writing and closing it again.
// Compiles to an empty function unless LOGGING is defined to a non-zero value.
void LogReset( void )
{
#if LOGGING
	FILE *logfile = fopen( LOGFILE, "w" );
	if ( logfile != NULL )
	{
		fclose( logfile );
	}
#endif
}
// Forward declaration only; the definition is not in this part of the file.
// Called below with printf-style format strings to build the text handed to
// OutputDebugString.
char *va( const char *fmt, ... );
//-----------------------------------------------------------------------------
// Purpose: Logs the number of words in the sentence, then one line per word
//  with its text and its start/end byte offsets
// Input  : sentence - sentence whose m_Words list is written to the log
//-----------------------------------------------------------------------------
void LogWords( CSentence& sentence )
{
	Log( "Wordcount == %i\n", sentence.m_Words.Size() );

	int index = 0;
	while ( index < sentence.m_Words.Size() )
	{
		const CWordTag *tag = sentence.m_Words[ index ];
		Log( "Word %s %u to %u\n", tag->m_pszWord, tag->m_uiStartByte, tag->m_uiEndByte );
		++index;
	}
}
//-----------------------------------------------------------------------------
// Purpose: Would log the phoneme count and one line per phoneme (text and
//  start/end byte offsets) for every word of the sentence. Currently disabled:
//  the function returns before doing anything.
// Input  : sentence - sentence whose phonemes would be written to the log
//-----------------------------------------------------------------------------
void LogPhonemes( CSentence& sentence )
{
	// Phoneme logging is switched off; this unconditional return makes
	// everything below unreachable.
	return;

	Log( "Phonemecount == %i\n", sentence.CountPhonemes() );

	int wordIndex = 0;
	while ( wordIndex < sentence.m_Words.Size() )
	{
		const CWordTag *tag = sentence.m_Words[ wordIndex ];

		int phonemeIndex = 0;
		while ( phonemeIndex < tag->m_Phonemes.Size() )
		{
			const CPhonemeTag *ph = tag->m_Phonemes[ phonemeIndex ];
			Log( "Phoneme %s %u to %u\n", ph->m_szPhoneme, ph->m_uiStartByte, ph->m_uiEndByte );
			++phonemeIndex;
		}

		++wordIndex;
	}
}
// Conversion divisor of 10,000,000.
// NOTE(review): presumably the number of 100-nanosecond units per second - confirm against callers.
// Deliberately has no trailing semicolon so the macro can be used inside larger expressions.
#define NANO_CONVERT 10000000.0f
//-----------------------------------------------------------------------------
// Purpose: Walk list of words and phonemes in a recognition result and create
//  the matching word and phoneme tags in the CSentence object
// FIXME:  Phoneme boundaries are not measured. Each word's byte range is split
//  among its phonemes in proportion to WeightForPhoneme().
// Input  : *converter - translates SAPI phone ids into phoneme text; if NULL
//				the function does nothing
//			result - recognition result to read the phrase from; if NULL the
//				function does nothing
//			sentence - reset to its base and refilled when the phrase contains
//				at least one element, otherwise left untouched
//-----------------------------------------------------------------------------
void EnumeratePhonemes( ISpPhoneConverter *converter, const ISpRecoResult* result, CSentence& sentence )
{
	USES_CONVERSION;

	// Phone ids cannot be turned into text without a converter
	if ( !converter )
		return;

	// Grab access to element container
	ISpPhrase *phrase = ( ISpPhrase * )result;
	if ( !phrase )
		return;

	SPPHRASE *pElements = 0;
	if ( !SUCCEEDED( phrase->GetPhrase( &pElements ) ) || !pElements )
		return;

	// Only replace what the sentence already holds when the result contains words
	if ( pElements->Rule.ulCountOfElements > 0 )
	{
		sentence.ResetToBase();

		// Walk list of words
		for ( ULONG i = 0; i < pElements->Rule.ulCountOfElements; i++ )
		{
			unsigned int wordstart, wordend;

			// Get start/end byte offsets of the word within the audio stream
			wordstart = pElements->pElements[i].ulAudioStreamOffset + (unsigned int)pElements->ullAudioStreamPosition;
			wordend = wordstart + pElements->pElements[i].ulAudioSizeBytes;

			// Create word tag
			CWordTag *w = new CWordTag( W2T( pElements->pElements[i].pszDisplayText ) );
			Assert( w );
			w->m_uiStartByte = wordstart;
			w->m_uiEndByte = wordend;
			sentence.AddWordTag( w );

			// An element without a pronunciation still gets its word tag, just no phonemes
			const SPPHONEID *pronunciation = pElements->pElements[i].pszPronunciation;
			if ( !pronunciation )
				continue;

			// One-id, zero-terminated string handed to the converter for each phoneme
			SPPHONEID pstr[ 2 ];
			pstr[ 1 ] = 0;
			WCHAR wszPhoneme[ SP_MAX_PRON_LENGTH ];
			const SPPHONEID *current;

			// First pass: add up the weights of all phonemes in this word
			float total_weight = 0.0f;
			for ( current = pronunciation; *current; current++ )
			{
				pstr[ 0 ] = *current;
				wszPhoneme[ 0 ] = L'\0';
				converter->IdToPhone( pstr, wszPhoneme );
				total_weight += WeightForPhoneme( W2A( wszPhoneme ) );
			}

			// Decide # of bytes per unit of phoneme weight
			float psize = 0;
			if ( total_weight )
			{
				psize = ( wordend - wordstart ) / total_weight;
			}

			// Second pass: re-walk the phoneme list and create true phoneme tags
			float startWeight = 0.0f;
			for ( current = pronunciation; *current; current++ )
			{
				pstr[ 0 ] = *current;
				wszPhoneme[ 0 ] = L'\0';
				converter->IdToPhone( pstr, wszPhoneme );

				// Convert once and reuse the result: each W2A call takes stack space
				// that is not given back until this function returns
				char *phonemeText = W2A( wszPhoneme );

				CPhonemeTag *p = new CPhonemeTag( phonemeText );
				Assert( p );

				float weight = WeightForPhoneme( phonemeText );
				p->m_uiStartByte = wordstart + (int)( startWeight * psize );
				p->m_uiEndByte = p->m_uiStartByte + (int)( psize * weight );
				startWeight += weight;

				// Convert to IPA phoneme code
				p->m_nPhonemeCode = TextToPhoneme( p->m_szPhoneme );

				sentence.AddPhonemeTag( w, p );
			}
		}
	}

	// Free memory
	::CoTaskMemFree(pElements);
}
//-----------------------------------------------------------------------------
// Purpose: Record kept for each word of the reference sentence while the
//  sentence grammar is being built
//-----------------------------------------------------------------------------
struct WORDRULETYPE
{
	// Set by AddWordRule() to DYN_SENTENCERULE + index + 1
	int ruleId;
	// Grammar state created for this word
	SPSTATEHANDLE hRule;
	// The word itself, used as the text of the word transition
	CSpDynamicString word;
	// Narrow-character copy of the word, used in debug output
	char plaintext[ 256 ];
};
//-----------------------------------------------------------------------------
// Purpose: Appends a rule record for one word of the sentence and creates a
//  new grammar state for it
// Input  : cpRecoGrammar - grammar in which the new state is created
//			*root - state handle passed to CreateNewState for the new state
//			*rules - list that receives the new record
//			word - word text; stored in the record and also copied, truncated
//				to fit, into the record's plaintext buffer
//-----------------------------------------------------------------------------
void AddWordRule( ISpRecoGrammar* cpRecoGrammar, SPSTATEHANDLE *root, CUtlVector< WORDRULETYPE > *rules, CSpDynamicString& word )
{
	USES_CONVERSION;
	HRESULT hr;
	WORDRULETYPE *newrule;

	int idx = (*rules).AddToTail();
	newrule = &(*rules)[ idx ];

	newrule->ruleId = DYN_SENTENCERULE + idx + 1;
	newrule->word = word;

	// Bounded copy: a long word must not run past the fixed-size plaintext
	// buffer, and an empty word (conversion yields NULL) becomes an empty string
	const char *text = W2T( word );
	strncpy( newrule->plaintext, text ? text : "", sizeof( newrule->plaintext ) - 1 );
	newrule->plaintext[ sizeof( newrule->plaintext ) - 1 ] = '\0';

	// Create empty rule
	hr = cpRecoGrammar->CreateNewState( *root, &newrule->hRule );
	Assert( !FAILED( hr ) );
}
//-----------------------------------------------------------------------------
// Purpose: Adds a transition that consumes the word of "from" and leads to the
//  state of "to", and traces the transition to the debugger output
// Input  : cpRecoGrammar - grammar receiving the transition
//			*from - rule whose state is the source and whose word labels the
//				transition; if NULL nothing is added
//			*to - destination rule, or NULL to make the transition terminal
//-----------------------------------------------------------------------------
void AddWordTransitionRule( ISpRecoGrammar* cpRecoGrammar, WORDRULETYPE *from, WORDRULETYPE *to )
{
	HRESULT hr;

	Assert( from );
	// Assert may compile to nothing in some builds (not visible here), so do not
	// fall through and dereference a missing source rule
	if ( !from )
		return;

	if ( !to )
	{
		OutputDebugString( va( "Transition from %s to TERM\r\n", from->plaintext ) );
	}
	else
	{
		OutputDebugString( va( "Transition from %s to %s\r\n", from->plaintext, to->plaintext ) );
	}

	hr = cpRecoGrammar->AddWordTransition( from->hRule, to ? to->hRule : NULL, (WCHAR *)from->word, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
	Assert( !FAILED( hr ) );
}
//-----------------------------------------------------------------------------
// Purpose: Adds a transition from the state of "from" to the state of "to"
//  that carries no word text (NULL is passed as the word), and traces it to
//  the debugger output
// Input  : cpRecoGrammar - grammar receiving the transition
//			*from - rule whose state is the source; if NULL nothing is added
//			*to - destination rule, or NULL to make the transition terminal
//-----------------------------------------------------------------------------
void AddOptionalTransitionRule( ISpRecoGrammar* cpRecoGrammar, WORDRULETYPE *from, WORDRULETYPE *to )
{
	HRESULT hr;

	Assert( from );
	// Assert may compile to nothing in some builds (not visible here), so do not
	// fall through and dereference a missing source rule
	if ( !from )
		return;

	if ( !to )
	{
		OutputDebugString( va( "Opt transition from %s to TERM\r\n", from->plaintext ) );
	}
	else
	{
		OutputDebugString( va( "Opt transition from %s to %s\r\n", from->plaintext, to->plaintext ) );
	}

	hr = cpRecoGrammar->AddWordTransition( from->hRule, to ? to->hRule : NULL, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
	Assert( !FAILED( hr ) );
}
// Largest number of rules an optional (wordless) transition may jump ahead
#define MAX_WORD_SKIP 1
//-----------------------------------------------------------------------------
// Purpose: Links together all word rule states into a sentence rule CFG and
//  commits the grammar
// Input  : cpRecoGrammar - grammar being built
//			*root - root state; a wordless transition leads from it to the
//				first word rule
//			*rules - word rules in sentence order, as filled by AddWordRule()
// Output : true if the grammar was committed successfully; false if Commit
//			failed, an argument is NULL, or there are no rules to link
//-----------------------------------------------------------------------------
bool BuildRules( ISpRecoGrammar* cpRecoGrammar, SPSTATEHANDLE *root, CUtlVector< WORDRULETYPE > *rules )
{
	HRESULT hr;
	WORDRULETYPE *rule, *next;

	// Nothing can be linked without a grammar, a root state and a rule list
	if ( !cpRecoGrammar || !root || !rules )
		return false;

	// The first rule is read unconditionally below, so an empty list must stop here
	int numrules = (*rules).Size();
	if ( numrules <= 0 )
		return false;

	rule = &(*rules)[ 0 ];

	// Add transition from the root to the first word rule
	hr = cpRecoGrammar->AddWordTransition( *root, rule->hRule, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
	Assert( !FAILED( hr ) );

	// Chain every word rule to its successor; the last one goes to the terminal
	for ( int i = 0; i < numrules; i++ )
	{
		rule = &(*rules)[ i ];
		if ( i < numrules - 1 )
		{
			next = &(*rules)[ i + 1 ];
		}
		else
		{
			next = NULL;
		}

		AddWordTransitionRule( cpRecoGrammar, rule, next );
	}

	if ( numrules > 1 )
	{
		for ( int skip = 1; skip <= min( MAX_WORD_SKIP, numrules ); skip++ )
		{
			OutputDebugString( va( "Opt transition from Root to %s\r\n", (*rules)[ 0 ].plaintext ) );

			hr = cpRecoGrammar->AddWordTransition( *root, (*rules)[ 0 ].hRule, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
			Assert( !FAILED( hr ) );

			// Now build rules where you can skip 1 to N intervening words.
			// Starts at the second rule; rules too close to the end have no
			// rule "skip" places ahead and get no transition here.
			for ( int i = 1; i < numrules; i++ )
			{
				if ( i >= numrules - skip )
					continue;

				rule = &(*rules)[ i ];
				next = &(*rules)[ i + skip ];

				// Add transition
				AddOptionalTransitionRule( cpRecoGrammar, rule, next );
			}

			// Go from final rule to end point
			AddOptionalTransitionRule( cpRecoGrammar, &(*rules)[ numrules - 1 ], NULL );
		}
	}

	// Store it
	hr = cpRecoGrammar->Commit(NULL);
	return !FAILED( hr );
}
//-----------------------------------------------------------------------------
// Purpose: Debugging, prints the text of each alternate phrase (up to 32) that
//  the recognizer returns for the whole result
// Input  : cpResult - recognition result to query; if NULL nothing is printed
//			(*pfnPrint - printf-style output callback
//-----------------------------------------------------------------------------
void PrintAlternates( ISpRecoResult* cpResult, void (*pfnPrint)( const char *fmt, ... ) )
{
	const ULONG maxAlternates = 32;

	ISpPhraseAlt *rgPhraseAlt[ maxAlternates ];
	memset( rgPhraseAlt, 0, sizeof( rgPhraseAlt ) );

	// Stays 0 unless GetAlternates succeeds, so the print loop never runs on garbage
	ULONG ulCount = 0;

	ISpPhrase *phrase = ( ISpPhrase * )cpResult;
	if ( phrase )
	{
		SPPHRASE *pElements = NULL;
		if ( SUCCEEDED( phrase->GetPhrase( &pElements ) ) && pElements )
		{
			if ( pElements->Rule.ulCountOfElements > 0 )
			{
				HRESULT hr = cpResult->GetAlternates(
					pElements->Rule.ulFirstElement,
					pElements->Rule.ulCountOfElements,
					maxAlternates,
					rgPhraseAlt,
					&ulCount);

				Assert( !FAILED( hr ) );
				if ( FAILED( hr ) )
				{
					ulCount = 0;
				}

				for ( ULONG r = 0 ; r < ulCount; r++ )
				{
					if ( !rgPhraseAlt[ r ] )
						continue;

					CSpDynamicString dstrText;
					hr = rgPhraseAlt[ r ]->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL);
					Assert( !FAILED( hr ) );

					pfnPrint( "[ ALT ]" );

					// CopyToChar hands back a buffer the caller owns. Print it as
					// data, never as the format string, then release it.
					char *text = dstrText.CopyToChar();
					if ( text )
					{
						pfnPrint( "%s", text );
						::CoTaskMemFree( text );
					}

					pfnPrint( "\r\n" );
				}
			}

			// The phrase structure returned by GetPhrase belongs to the caller
			::CoTaskMemFree( pElements );
		}
	}

	// Drop the references handed out by GetAlternates
	for ( ULONG i = 0; i < maxAlternates; i++ )
	{
		if ( rgPhraseAlt[ i ] )
		{
			rgPhraseAlt[ i ]->Release();
			rgPhraseAlt[ i ] = NULL;
		}
	}
}
//-----------------------------------------------------------------------------
// Purpose: Prints every word of the sentence with its start/end byte offsets,
//  each followed by the word's phonemes with their own offsets
// Input  : sentence - sentence to dump; NULL word or phoneme entries are skipped
//			(*pfnPrint - printf-style output callback
//-----------------------------------------------------------------------------
void PrintWordsAndPhonemes( CSentence& sentence, void (*pfnPrint)( const char *fmt, ... ) )
{
	int i;

	pfnPrint( "WORDS\r\n\r\n" );

	for ( i = 0 ; i < sentence.m_Words.Size(); i++ )
	{
		CWordTag *word = sentence.m_Words[ i ];
		if ( !word )
			continue;

		// Let the callback do the formatting: no fixed-size scratch buffer to
		// overrun, and the word text is passed as data rather than as a format
		pfnPrint( "<%u - %u> %s\r\n",
			word->m_uiStartByte, word->m_uiEndByte, word->m_pszWord );

		for ( int j = 0 ; j < word->m_Phonemes.Size(); j++ )
		{
			CPhonemeTag *phoneme = word->m_Phonemes[ j ];
			if ( !phoneme )
				continue;

			pfnPrint( " <%u - %u> %s\r\n",
				phoneme->m_uiStartByte, phoneme->m_uiEndByte, phoneme->m_szPhoneme );
		}
	}

	pfnPrint( "\r\n" );
}
//-----------------------------------------------------------------------------
// Purpose: Given a wave file and a string of words "text", creates a CFG from the
// sentence and stores the resulting words/phonemes in CSentence
// Input : *wavname -
// text -
// sentence -
// (*pfnPrint -
// Output : SR_RESULT
//-----------------------------------------------------------------------------
SR_RESULT ExtractPhonemes( const char *wavname, CSpDynamicString& text, CSentence& sentence, void (*pfnPrint)( const char *fmt, ...) )
{
// Assume failure
SR_RESULT result = SR_RESULT_ERROR;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -