// neuralnetwork.cpp
}
else
{
// loading: read the learning rate, then reconstruct and deserialize each layer
double eta;
ar >> eta;
m_etaLearningRate = eta; // two-step storage is needed since m_etaLearningRate is "volatile"
int nLayers;
NNLayer* pLayer = NULL;
ar >> nLayers;
for ( int ii=0; ii<nLayers; ++ii )
{
pLayer = new NNLayer( _T(""), pLayer );
m_Layers.push_back( pLayer );
pLayer->Serialize( ar );
}
}
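// For reference, the storing branch of Serialize (not included in this excerpt) would be
// expected to mirror the format read above. A minimal sketch, assuming the same archive
// operators and member names used in the loading code:
//
//    ar << m_etaLearningRate;
//    ar << (int) m_Layers.size();
//    for ( UINT ii=0; ii<m_Layers.size(); ++ii )
//    {
//        m_Layers[ ii ]->Serialize( ar );
//    }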
}
///////////////////////////////////////////////////////////////////////
//
// NNLayer class definition
NNLayer::NNLayer() :
label( _T("") ), m_pPrevLayer( NULL )
{
Initialize();
}
NNLayer::NNLayer( LPCTSTR str, NNLayer* pPrev /* =NULL */ ) :
label( str ), m_pPrevLayer( pPrev )
{
Initialize();
}
void NNLayer::Initialize()
{
VectorWeights::iterator wit;
VectorNeurons::iterator nit;
for( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
{
delete *nit;
}
for( wit=m_Weights.begin(); wit<m_Weights.end(); wit++ )
{
delete *wit;
}
m_Weights.clear();
m_Neurons.clear();
m_bFloatingPointWarning = false;
}
NNLayer::~NNLayer()
{
// calling Initialize() here makes sense if you think about it: it deletes all neurons and weights owned by this layer
Initialize();
}
void NNLayer::Calculate()
{
ASSERT( m_pPrevLayer != NULL );
VectorNeurons::iterator nit;
VectorConnections::iterator cit;
double dSum;
for( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
{
NNNeuron& n = *(*nit); // to ease the terminology
cit = n.m_Connections.begin();
ASSERT( (*cit).WeightIndex < m_Weights.size() );
dSum = m_Weights[ (*cit).WeightIndex ]->value; // the first connection's weight is the bias; its NeuronIndex is ignored
for ( cit++ ; cit<n.m_Connections.end(); cit++ )
{
ASSERT( (*cit).WeightIndex < m_Weights.size() );
ASSERT( (*cit).NeuronIndex < m_pPrevLayer->m_Neurons.size() );
dSum += ( m_Weights[ (*cit).WeightIndex ]->value ) *
( m_pPrevLayer->m_Neurons[ (*cit).NeuronIndex ]->output );
}
n.output = SIGMOID( dSum );
}
}
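// In equation form, Calculate() above computes, for each neuron i of this layer,
//
//    Xn[ i ] = F( w_bias + sum_j( w_ij * Xnm1[ j ] ) )
//
// where w_bias is the weight of the neuron's first connection (whose NeuronIndex is ignored),
// the sum runs over the remaining connections into the previous layer, and F is the squashing
// function applied through the SIGMOID macro.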
void NNLayer::Backpropagate( std::vector< double >& dErr_wrt_dXn /* in */,
std::vector< double >& dErr_wrt_dXnm1 /* out */,
std::vector< double >* thisLayerOutput, // memorized values of this layer's output
std::vector< double >* prevLayerOutput, // memorized values of previous layer's output
double etaLearningRate )
{
// nomenclature (repeated from NeuralNetwork class):
//
// Err is output error of the entire neural net
// Xn is the output vector on the n-th layer
// Xnm1 is the output vector of the previous layer
// Wn is the vector of weights of the n-th layer
// Yn is the activation value of the n-th layer, i.e., the weighted sum of inputs BEFORE the squashing function is applied
// F is the squashing function: Xn = F(Yn)
// F' is the derivative of the squashing function
// Conveniently, for F = tanh, F'(Yn) = 1 - Xn^2, i.e., the derivative can be calculated from the output alone, without knowledge of the input
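//
// As a reference for the SIGMOID/DSIGMOID macros used below, a minimal sketch of definitions
// consistent with the comment above (illustrative only; the project's actual macros are
// defined elsewhere and may use a scaled tanh):
//
//    #define SIGMOID(x)    ( tanh( (x) ) )
//    #define DSIGMOID(S)   ( 1.0 - (S)*(S) )   // F' expressed in terms of the output S = F(Yn)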
ASSERT( dErr_wrt_dXn.size() == m_Neurons.size() );
ASSERT( m_pPrevLayer != NULL );
ASSERT( dErr_wrt_dXnm1.size() == m_pPrevLayer->m_Neurons.size() );
int ii, jj;
UINT kk;
int nIndex;
double output;
std::vector< double > dErr_wrt_dYn( m_Neurons.size() );
//
// std::vector< double > dErr_wrt_dWn( m_Weights.size(), 0.0 ); // important to initialize to zero
//////////////////////////////////////////////////
//
///// DESIGN TRADEOFF: REVIEW !!
// We would prefer (for ease of coding) to use STL vector for the array "dErr_wrt_dWn", which is the
// differential of the current pattern's error with respect to the weights in the layer. However, for layers
// with many connections, such as fully-connected layers, there is also a very large number of weights. The STL vector
// class's allocator is remarkably stupid when allocating large memory chunks, and causes a remarkable
// number of page faults, with a consequent slowing of the application's overall execution time.
// To fix this, I tried using a plain-old C array, by new'ing the needed space from the heap, and
// delete[]'ing it at the end of the function. However, this caused the same number of page-fault
// errors, and did not improve performance.
// So I tried a plain-old C array allocated on the stack (i.e., not the heap). Of course I could not
// write a statement like
// double dErr_wrt_dWn[ m_Weights.size() ];
// since the compiler insists upon a compile-time known constant value for the size of the array.
// To avoid this requirement, I used the _alloca function, to allocate memory on the stack.
// The downside of this is excessive stack usage, and there might be stack overflow problems. That's why
// this comment is labeled "REVIEW"
double* dErr_wrt_dWn = (double*)( _alloca( sizeof(double) * m_Weights.size() ) );
for ( ii=0; ii<m_Weights.size(); ++ii )
{
dErr_wrt_dWn[ ii ] = 0.0;
}
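// A possible alternative to the _alloca approach above (a sketch only, not part of the original
// design): give the layer a reusable scratch buffer, e.g. a hypothetical member
//
//    std::vector< double > m_dErrScratch;   // assumed member, grown once and reused
//
// and then, per call:
//
//    m_dErrScratch.assign( m_Weights.size(), 0.0 );
//    double* dErr_wrt_dWn = &m_dErrScratch[ 0 ];
//
// This amortizes the allocation across calls and avoids the stack-overflow risk, but it is not
// safe if several threads backpropagate through the same layer concurrently, which the atomic
// weight update below suggests can happen.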
VectorNeurons::iterator nit;
VectorConnections::iterator cit;
BOOL bMemorized = ( thisLayerOutput != NULL ) && ( prevLayerOutput != NULL );
// calculate dErr_wrt_dYn = F'(Yn) * dErr_wrt_Xn
for ( ii=0; ii<m_Neurons.size(); ++ii )
{
ASSERT( ii<dErr_wrt_dYn.size() );
ASSERT( ii<dErr_wrt_dXn.size() );
if ( bMemorized != FALSE )
{
output = (*thisLayerOutput)[ ii ];
}
else
{
output = m_Neurons[ ii ]->output;
}
dErr_wrt_dYn[ ii ] = DSIGMOID( output ) * dErr_wrt_dXn[ ii ];
}
// calculate dErr_wrt_Wn = Xnm1 * dErr_wrt_Yn
// For each neuron in this layer, go through the list of connections from the prior layer, and
// update the differential for the corresponding weight
ii = 0;
for ( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
{
NNNeuron& n = *(*nit); // for simplifying the terminology
for ( cit=n.m_Connections.begin(); cit<n.m_Connections.end(); cit++ )
{
kk = (*cit).NeuronIndex;
if ( kk == ULONG_MAX )
{
output = 1.0; // this is the bias weight
}
else
{
ASSERT( kk<m_pPrevLayer->m_Neurons.size() );
if ( bMemorized != FALSE )
{
output = (*prevLayerOutput)[ kk ];
}
else
{
output = m_pPrevLayer->m_Neurons[ kk ]->output;
}
}
//////////// ASSERT( (*cit).WeightIndex < dErr_wrt_dWn.size() ); // disabled: since dErr_wrt_dWn is now a C-style array, size() is no longer available
ASSERT( ii<dErr_wrt_dYn.size() );
dErr_wrt_dWn[ (*cit).WeightIndex ] += dErr_wrt_dYn[ ii ] * output;
}
ii++;
}
// calculate dErr_wrt_Xnm1 = Wn * dErr_wrt_dYn, which is needed as the input value of
// dErr_wrt_Xn for backpropagation of the next (i.e., previous) layer
// For each neuron in this layer
ii = 0;
for ( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
{
NNNeuron& n = *(*nit); // for simplifying the terminology
for ( cit=n.m_Connections.begin(); cit<n.m_Connections.end(); cit++ )
{
kk=(*cit).NeuronIndex;
if ( kk != ULONG_MAX )
{
// we exclude ULONG_MAX, which signifies the phantom bias neuron with
// constant output of "1", since we cannot train the bias neuron
nIndex = kk;
ASSERT( nIndex<dErr_wrt_dXnm1.size() );
ASSERT( ii<dErr_wrt_dYn.size() );
ASSERT( (*cit).WeightIndex<m_Weights.size() );
dErr_wrt_dXnm1[ nIndex ] += dErr_wrt_dYn[ ii ] * m_Weights[ (*cit).WeightIndex ]->value;
}
}
ii++; // ii tracks the neuron iterator
}
struct DOUBLE_UNION
{
union
{
double dd;
unsigned __int64 ullong;
};
};
DOUBLE_UNION oldValue, newValue;
// finally, update the weights of this layer using dErr_wrt_dW and the learning rate eta.
// Use an atomic compare-and-exchange operation, since another thread might be in the middle
// of its own backpropagation pass and might have shifted the weights slightly in the meantime
double dMicron = ::GetPreferences().m_dMicronLimitParameter;
double epsilon, divisor;
for ( jj=0; jj<m_Weights.size(); ++jj )
{
divisor = m_Weights[ jj ]->diagHessian + dMicron ;
// the following code has been rendered unnecessary, since the value of the Hessian has been
// verified when it was created, so as to ensure that it is strictly
// zero-positive. Thus, it is impossible for the diagHessian to be less than zero,
// and it is impossible for the divisor to be less than dMicron
/*
if ( divisor < dMicron )
{
// it should not be possible to reach here, since everything in the second derivative equations
// is strictly zero-positive, and thus "divisor" should definitely be as large as MICRON.
ASSERT( divisor >= dMicron );
divisor = 1.0 ; // this will limit the size of the update to the same as the size of global eta
}
*/
epsilon = etaLearningRate / divisor;
oldValue.dd = m_Weights[ jj ]->value;
newValue.dd = oldValue.dd - epsilon * dErr_wrt_dWn[ jj ];
while ( oldValue.ullong != _InterlockedCompareExchange64( (unsigned __int64*)(&m_Weights[ jj ]->value),
newValue.ullong, oldValue.ullong ) )
{
// another thread must have modified the weight. Obtain its new value, adjust it, and try again
oldValue.dd = m_Weights[ jj ]->value;
newValue.dd = oldValue.dd - epsilon * dErr_wrt_dWn[ jj ];
}
}
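// For comparison, the same lock-free update expressed with C++11 std::atomic (a sketch only,
// assuming the weight value were stored as a std::atomic<double> named w, which is not how
// NNWeight declares it above):
//
//    double expected = w.load();
//    double desired  = expected - epsilon * dErr_wrt_dWn[ jj ];
//    while ( !w.compare_exchange_weak( expected, desired ) )
//    {
//        // on failure, "expected" is refreshed with the current value; recompute and retry
//        desired = expected - epsilon * dErr_wrt_dWn[ jj ];
//    }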
}
void NNLayer::PeriodicWeightSanityCheck()
{
// called periodically by the neural net, to request a check on the "reasonableness" of the
// weights. The warning message is given only once per layer
VectorWeights::iterator wit;
for ( wit=m_Weights.begin(); wit<m_Weights.end(); wit++ )
{
NNWeight& ww = *(*wit);
double val = fabs( ww.value );
if ( (val>100.0) && (m_bFloatingPointWarning == false) )
{
// 100.0 is an arbitrary value that no reasonable weight should ever exceed
CString strMess;
strMess.Format( _T( "Caution: Weights are becoming unboundedly large \n" )
_T( "Layer: %s \nWeight: %s \nWeight value = %g \nWeight Hessian = %g\n\n" )
_T( "Suggest abandoning this backpropagation and investigating" ),
label.c_str(), ww.label.c_str(), ww.value, ww.diagHessian );
::MessageBox( NULL, strMess, _T( "Problem With Weights" ), MB_ICONEXCLAMATION | MB_OK );
m_bFloatingPointWarning = true;
}
}
}
void NNLayer::EraseHessianInformation()
{
// goes through all the weights associated with this layer, and sets each of their
// diagHessian value to zero
VectorWeights::iterator wit;
for ( wit=m_Weights.begin(); wit<m_Weights.end(); wit++ )
{
(*wit)->diagHessian = 0.0;
}
}
void NNLayer::DivideHessianInformationBy(double divisor)
{
// goes through all the weights associated with this layer, and divides each of their
// diagHessian value by the indicated divisor
VectorWeights::iterator wit;
double dTemp;
for ( wit=m_Weights.begin(); wit<m_Weights.end(); wit++ )
{
dTemp = (*wit)->diagHessian;
if ( dTemp < 0.0 )
{
// it should not be possible to reach here, since the second-derivative calculations are
// strictly non-negative; clamp defensively anyway
ASSERT( dTemp >= 0.0 );
dTemp = 0.0;
}
(*wit)->diagHessian = dTemp / divisor;
}
}