// neuralnetwork.cpp
{
// it should not be possible to reach here, since all calculations for the second
// derivative are strictly non-negative. However, there are some early indications
// that this check is necessary anyway
ASSERT ( dTemp >= 0.0 ); // will break in debug mode
dTemp = 0.0;
}
(*wit)->diagHessian = dTemp / divisor ;
}
}
void NNLayer::BackpropagateSecondDerivatives( std::vector< double >& d2Err_wrt_dXn /* in */,
std::vector< double >& d2Err_wrt_dXnm1 /* out */)
{
// nomenclature (repeated from NeuralNetwork class)
// NOTE: even though we are addressing SECOND derivatives ( and not first derivatives),
// we use nearly the same notation as if there were first derivatives, since otherwise the
// ASCII look would be confusing. We add one "2" but not two "2's", such as "d2Err_wrt_dXn",
// to give a gentle emphasis that we are using second derivatives
//
// Err is output error of the entire neural net
// Xn is the output vector on the n-th layer
// Xnm1 is the output vector of the previous layer
// Wn is the vector of weights of the n-th layer
// Yn is the activation value of the n-th layer, i.e., the weighted sum of inputs BEFORE the squashing function is applied
// F is the squashing function: Xn = F(Yn)
// F' is the derivative of the squashing function
// Conveniently, for F = tanh, then F'(Yn) = 1 - Xn^2, i.e., the derivative can be calculated from the output, without knowledge of the input
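//
// Summary of the recursion implemented below (a restatement of the code that follows; the squared
// factors arise from the usual diagonal Levenberg-Marquardt / Gauss-Newton approximation, which
// drops the cross terms and the terms involving F''):
//
//    d2Err_wrt_dYn[ ii ]     =  ( F'(Yn[ ii ]) )^2  *  d2Err_wrt_dXn[ ii ]
//    d2Err_wrt_dWn[ w ]     +=  ( Xnm1[ kk ] )^2    *  d2Err_wrt_dYn[ ii ]   // for each connection kk -> ii using weight w
//    d2Err_wrt_dXnm1[ kk ]  +=  ( Wn[ w ] )^2       *  d2Err_wrt_dYn[ ii ]   // summed over all ii fed by neuron kk
//
// i.e., the same shape as ordinary backpropagation, with every multiplying factor squared.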
ASSERT( d2Err_wrt_dXn.size() == m_Neurons.size() );
ASSERT( m_pPrevLayer != NULL );
ASSERT( d2Err_wrt_dXnm1.size() == m_pPrevLayer->m_Neurons.size() );
int ii, jj;
UINT kk;
int nIndex;
double output;
double dTemp;
std::vector< double > d2Err_wrt_dYn( m_Neurons.size() );
//
// std::vector< double > d2Err_wrt_dWn( m_Weights.size(), 0.0 ); // important to initialize to zero
//////////////////////////////////////////////////
//
///// DESIGN TRADEOFF: REVIEW !!
//
// Note that the reasoning of this comment is identical to that in the NNLayer::Backpropagate()
// function, from which this BackpropagateSecondDerivatives() function is derived
//
// We would prefer (for ease of coding) to use STL vector for the array "d2Err_wrt_dWn", which is the
// second differential of the current pattern's error wrt the weights in the layer. However, layers
// with many neurons, such as fully-connected layers, also have many weights. The STL vector
// class's allocator is remarkably stupid when allocating large memory chunks, and causes a remarkable
// number of page faults, with a consequent slowing of the application's overall execution time.
// To fix this, I tried using a plain-old C array, by new'ing the needed space from the heap, and
// delete[]'ing it at the end of the function. However, this caused the same number of page-fault
// errors, and did not improve performance.
// So I tried a plain-old C array allocated on the stack (i.e., not the heap). Of course I could not
// write a statement like
// double d2Err_wrt_dWn[ m_Weights.size() ];
// since the compiler insists upon a compile-time known constant value for the size of the array.
// To avoid this requirement, I used the _alloca function, to allocate memory on the stack.
// The downside of this is excessive stack usage, and there might be stack overflow problems. That's why
// this comment is labeled "REVIEW". One possible alternative is sketched just after the initialization loop below.
double* d2Err_wrt_dWn = (double*)( _alloca( sizeof(double) * m_Weights.size() ) );
for ( ii=0; ii<m_Weights.size(); ++ii )
{
d2Err_wrt_dWn[ ii ] = 0.0;
}
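// One possible alternative to _alloca, sketched here only for the "REVIEW" note above and not used
// in this implementation: keep a scratch vector as a member of NNLayer and reuse it across calls,
// so the large allocation happens once rather than on every pattern. The member name
// m_d2ErrScratch is hypothetical and does not exist in this class.
//
// // in the class declaration:  std::vector< double > m_d2ErrScratch;  // hypothetical member
// //
// // m_d2ErrScratch.assign( m_Weights.size(), 0.0 );  // grows once, then only re-zeroes
// // double* d2Err_wrt_dWn = &m_d2ErrScratch[ 0 ];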
VectorNeurons::iterator nit;
VectorConnections::iterator cit;
// calculate d2Err_wrt_dYn = ( F'(Yn) )^2 * dErr_wrt_Xn (where dErr_wrt_Xn is actually a second derivative )
for ( ii=0; ii<m_Neurons.size(); ++ii )
{
ASSERT( ii<d2Err_wrt_dYn.size() );
ASSERT( ii<d2Err_wrt_dXn.size() );
output = m_Neurons[ ii ]->output;
dTemp = DSIGMOID( output ) ;
d2Err_wrt_dYn[ ii ] = d2Err_wrt_dXn[ ii ] * dTemp * dTemp;
}
// calculate d2Err_wrt_dWn = ( Xnm1 )^2 * d2Err_wrt_dYn (where d2Err_wrt_dYn is actually a second derivative)
// For each neuron in this layer, go through the list of connections from the prior layer, and
// update the differential for the corresponding weight
ii = 0;
for ( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
{
NNNeuron& n = *(*nit); // for simplifying the terminology
for ( cit=n.m_Connections.begin(); cit<n.m_Connections.end(); cit++ )
{
kk = (*cit).NeuronIndex;
if ( kk == ULONG_MAX )
{
output = 1.0; // this is the bias connection; implied neuron output of "1"
}
else
{
ASSERT( kk<m_pPrevLayer->m_Neurons.size() );
output = m_pPrevLayer->m_Neurons[ kk ]->output;
}
//////////// ASSERT( (*cit).WeightIndex < d2Err_wrt_dWn.size() ); // disabled: now that d2Err_wrt_dWn is a C-style array, there is no size() function and this check won't compile
ASSERT( ii<d2Err_wrt_dYn.size() );
d2Err_wrt_dWn[ (*cit).WeightIndex ] += d2Err_wrt_dYn[ ii ] * output * output ;
}
ii++;
}
// calculate d2Err_wrt_Xnm1 = ( Wn )^2 * d2Err_wrt_dYn (where d2Err_wrt_dYn is a second derivative not a first).
// d2Err_wrt_Xnm1 is needed as the input value of
// d2Err_wrt_Xn for backpropagation of second derivatives for the next (i.e., previous spatially) layer
// For each neuron in this layer
ii = 0;
for ( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
{
NNNeuron& n = *(*nit); // for simplifying the terminology
for ( cit=n.m_Connections.begin(); cit<n.m_Connections.end(); cit++ )
{
kk=(*cit).NeuronIndex;
if ( kk != ULONG_MAX )
{
// we exclude ULONG_MAX, which signifies the phantom bias neuron with
// constant output of "1", since we cannot train the bias neuron
nIndex = kk;
ASSERT( nIndex<d2Err_wrt_dXnm1.size() );
ASSERT( ii<d2Err_wrt_dYn.size() );
ASSERT( (*cit).WeightIndex<m_Weights.size() );
dTemp = m_Weights[ (*cit).WeightIndex ]->value ;
d2Err_wrt_dXnm1[ nIndex ] += d2Err_wrt_dYn[ ii ] * dTemp * dTemp ;
}
}
ii++; // ii tracks the neuron iterator
}
struct DOUBLE_UNION
{
union
{
double dd;
unsigned __int64 ullong;
};
};
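// The union above provides a bit-level view of a double as a 64-bit integer:
// _InterlockedCompareExchange64 compares and swaps raw 64-bit values rather than floating-point
// numbers, so the Hessian is read and written through .dd but compared and exchanged through
// .ullong. This depends on double and unsigned __int64 both being 8 bytes, which is true for the
// MSVC targets this code is written for.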
DOUBLE_UNION oldValue, newValue;
// finally, update the diagonal Hessians for the weights of this layer using d2Err_wrt_dWn.
// By design, this function (and its iteration over many patterns, approximately 500) is called while a
// single thread has locked the neural network, so there is no possibility that another
// thread might change the value of the Hessian. Nevertheless, since it's easy to do, we
// use an atomic compare-and-exchange operation, which would remain safe even if another thread were
// in the process of backpropagating second derivatives and the Hessians had shifted slightly
for ( jj=0; jj<m_Weights.size(); ++jj )
{
oldValue.dd = m_Weights[ jj ]->diagHessian;
newValue.dd = oldValue.dd + d2Err_wrt_dWn[ jj ];
while ( oldValue.ullong != _InterlockedCompareExchange64( (volatile __int64*)(&m_Weights[ jj ]->diagHessian),
newValue.ullong, oldValue.ullong ) )
{
// another thread must have modified the weight. Obtain its new value, adjust it, and try again
oldValue.dd = m_Weights[ jj ]->diagHessian;
newValue.dd = oldValue.dd + d2Err_wrt_dWn[ jj ];
}
}
}
void NNLayer::Serialize(CArchive &ar)
{
VectorNeurons::iterator nit;
VectorWeights::iterator wit;
VectorConnections::iterator cit;
int ii, jj;
if (ar.IsStoring())
{
// TODO: add storing code here
ar.WriteString( label.c_str() );
ar.WriteString( _T("\r\n") ); // ar.ReadString will look for \r\n when loading from the archive
ar << m_Neurons.size();
ar << m_Weights.size();
for ( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
{
NNNeuron& n = *(*nit);
ar.WriteString( n.label.c_str() );
ar.WriteString( _T("\r\n") );
ar << n.m_Connections.size();
for ( cit=n.m_Connections.begin(); cit<n.m_Connections.end(); cit++ )
{
ar << (*cit).NeuronIndex;
ar << (*cit).WeightIndex;
}
}
for ( wit=m_Weights.begin(); wit<m_Weights.end(); wit++ )
{
ar.WriteString( (*wit)->label.c_str() );
ar.WriteString( _T("\r\n") );
ar << (*wit)->value;
}
}
else
{
// TODO: add loading code here
CString str;
ar.ReadString( str );
label = str;
int iNumNeurons, iNumWeights, iNumConnections;
double value;
NNNeuron* pNeuron;
NNWeight* pWeight;
NNConnection conn;
ar >> iNumNeurons;
ar >> iNumWeights;
for ( ii=0; ii<iNumNeurons; ++ii )
{
ar.ReadString( str );
pNeuron = new NNNeuron( (LPCTSTR)str );
m_Neurons.push_back( pNeuron );
ar >> iNumConnections;
for ( jj=0; jj<iNumConnections; ++jj )
{
ar >> conn.NeuronIndex;
ar >> conn.WeightIndex;
pNeuron->AddConnection( conn );
}
}
for ( jj=0; jj<iNumWeights; ++jj )
{
ar.ReadString( str );
ar >> value;
pWeight = new NNWeight( (LPCTSTR)str, value );
m_Weights.push_back( pWeight );
}
}
}
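// For reference, the archive layout written and read by NNLayer::Serialize above is, in order:
//
//    layer label, terminated by "\r\n"
//    number of neurons, number of weights
//    for each neuron:  neuron label + "\r\n", connection count,
//                      then ( NeuronIndex, WeightIndex ) for each connection
//    for each weight:  weight label + "\r\n", weight value
//
// Note that diagHessian is not written to or read from the archive.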
///////////////////////////////////////////////////////////////////////
//
// NNWeight
NNWeight::NNWeight() :
label( _T("") ),
value( 0.0 ), diagHessian( 0.0 )
{
Initialize();
}
NNWeight::NNWeight( LPCTSTR str, double val /* =0.0 */ ) :
label( str ),
value( val ), diagHessian( 0.0 )
{
Initialize();
}
void NNWeight::Initialize()
{
}
NNWeight::~NNWeight()
{
}
///////////////////////////////////////////////////////////////////////
//
// NNNeuron
NNNeuron::NNNeuron() :
label( _T("") ), output( 0.0 )
{
Initialize();
}
NNNeuron::NNNeuron( LPCTSTR str ) :
label( str ), output( 0.0 )
{
Initialize();
}
void NNNeuron::Initialize()
{
m_Connections.clear();
}
NNNeuron::~NNNeuron()
{
Initialize();
}
void NNNeuron::AddConnection( UINT iNeuron, UINT iWeight )
{
m_Connections.push_back( NNConnection( iNeuron, iWeight ) );
}
void NNNeuron::AddConnection( NNConnection const & conn )
{
m_Connections.push_back( conn );
}
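// Illustrative use only (not part of this file; the variable names are hypothetical): a neuron is
// wired up by adding one connection per input, plus a bias connection whose NeuronIndex is
// ULONG_MAX -- the sentinel that the backpropagation code above treats as a phantom neuron with a
// constant output of 1.0.
//
// // NNNeuron* pNeuron = m_Neurons[ ii ];                               // hypothetical neuron being wired
// // pNeuron->AddConnection( iPrevNeuron, iWeight );                    // ordinary connection
// // pNeuron->AddConnection( NNConnection( ULONG_MAX, iBiasWeight ) );  // bias connection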