// neuralnetwork.cpp
}
else
{
// loading: read the learning rate, then reconstruct and deserialize each layer
double eta;
ar >> eta;
m_etaLearningRate = eta; // two-step storage is needed since m_etaLearningRate is "volatile"
int nLayers;
NNLayer* pLayer = NULL;
ar >> nLayers;
for ( int ii=0; ii<nLayers; ++ii )
{
pLayer = new NNLayer( _T(""), pLayer );
m_Layers.push_back( pLayer );
pLayer->Serialize( ar );
}
}
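// For reference, the storing branch of Serialize (not included in this excerpt) would be
// expected to mirror the format read above. A minimal sketch, assuming the same archive
// operators and member names used in the loading code:
//
//    ar << m_etaLearningRate;
//    ar << (int) m_Layers.size();
//    for ( UINT ii=0; ii<m_Layers.size(); ++ii )
//    {
//        m_Layers[ ii ]->Serialize( ar );
//    }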
}
///////////////////////////////////////////////////////////////////////
//
// NNLayer class definition
NNLayer::NNLayer() :
label( _T("") ), m_pPrevLayer( NULL )
{
Initialize();
}
NNLayer::NNLayer( LPCTSTR str, NNLayer* pPrev /* =NULL */ ) :
label( str ), m_pPrevLayer( pPrev )
{
Initialize();
}
void NNLayer::Initialize()
{
VectorWeights::iterator wit;
VectorNeurons::iterator nit;
for( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
{
delete *nit;
}
for( wit=m_Weights.begin(); wit<m_Weights.end(); wit++ )
{
delete *wit;
}
m_Weights.clear();
m_Neurons.clear();
m_bFloatingPointWarning = false;
}
NNLayer::~NNLayer()
{
// calling Initialize() here makes sense if you think about it: it deletes all neurons and weights owned by this layer
Initialize();
}
void NNLayer::Calculate()
{
ASSERT( m_pPrevLayer != NULL );
VectorNeurons::iterator nit;
VectorConnections::iterator cit;
double dSum;
for( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
{
NNNeuron& n = *(*nit); // to ease the terminology
cit = n.m_Connections.begin();
ASSERT( (*cit).WeightIndex < m_Weights.size() );
dSum = m_Weights[ (*cit).WeightIndex ]->value; // the first connection's weight is the bias; its NeuronIndex is ignored
for ( cit++ ; cit<n.m_Connections.end(); cit++ )
{
ASSERT( (*cit).WeightIndex < m_Weights.size() );
ASSERT( (*cit).NeuronIndex < m_pPrevLayer->m_Neurons.size() );
dSum += ( m_Weights[ (*cit).WeightIndex ]->value ) *
( m_pPrevLayer->m_Neurons[ (*cit).NeuronIndex ]->output );
}
n.output = SIGMOID( dSum );
}
}
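// In equation form, Calculate() above computes, for each neuron i of this layer,
//
//    Xn[ i ] = F( w_bias + sum_j( w_ij * Xnm1[ j ] ) )
//
// where w_bias is the weight of the neuron's first connection (whose NeuronIndex is ignored),
// the sum runs over the remaining connections into the previous layer, and F is the squashing
// function applied through the SIGMOID macro.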
void NNLayer::Backpropagate( std::vector< double >& dErr_wrt_dXn /* in */,
std::vector< double >& dErr_wrt_dXnm1 /* out */,
std::vector< double >* thisLayerOutput, // memorized values of this layer's output
std::vector< double >* prevLayerOutput, // memorized values of previous layer's output
double etaLearningRate )
{
// nomenclature (repeated from NeuralNetwork class):
//
// Err is output error of the entire neural net
// Xn is the output vector on the n-th layer
// Xnm1 is the output vector of the previous layer
// Wn is the vector of weights of the n-th layer
// Yn is the activation value of the n-th layer, i.e., the weighted sum of inputs BEFORE the squashing function is applied
// F is the squashing function: Xn = F(Yn)
// F' is the derivative of the squashing function
// Conveniently, for F = tanh, F'(Yn) = 1 - Xn^2, i.e., the derivative can be calculated from the output alone, without knowledge of the input
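//
// As a reference for the SIGMOID/DSIGMOID macros used below, a minimal sketch of definitions
// consistent with the comment above (illustrative only; the project's actual macros are
// defined elsewhere and may use a scaled tanh):
//
//    #define SIGMOID(x)    ( tanh( (x) ) )
//    #define DSIGMOID(S)   ( 1.0 - (S)*(S) )   // F' expressed in terms of the output S = F(Yn)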
ASSERT( dErr_wrt_dXn.size() == m_Neurons.size() );
ASSERT( m_pPrevLayer != NULL );
ASSERT( dErr_wrt_dXnm1.size() == m_pPrevLayer->m_Neurons.size() );
int ii, jj;
UINT kk;
int nIndex;
double output;
std::vector< double > dErr_wrt_dYn( m_Neurons.size() );
//
// std::vector< double > dErr_wrt_dWn( m_Weights.size(), 0.0 ); // important to initialize to zero
//////////////////////////////////////////////////
//
///// DESIGN TRADEOFF: REVIEW !!
// We would prefer (for ease of coding) to use STL vector for the array "dErr_wrt_dWn", which is the
// differential of the current pattern's error with respect to the weights in the layer. However, for layers
// with many connections, such as fully-connected layers, there is also a very large number of weights. The STL vector
// class's allocator is remarkably stupid when allocating large memory chunks, and causes a remarkable
// number of page faults, with a consequent slowing of the application's overall execution time.
// To fix this, I tried using a plain-old C array, by new'ing the needed space from the heap, and
// delete[]'ing it at the end of the function. However, this caused the same number of page-fault
// errors, and did not improve performance.
// So I tried a plain-old C array allocated on the stack (i.e., not the heap). Of course I could not
// write a statement like
// double dErr_wrt_dWn[ m_Weights.size() ];
// since the compiler insists upon a compile-time known constant value for the size of the array.
// To avoid this requirement, I used the _alloca function, to allocate memory on the stack.
// The downside of this is excessive stack usage, and there might be stack overflow problems. That's why
// this comment is labeled "REVIEW"
double* dErr_wrt_dWn = (double*)( _alloca( sizeof(double) * m_Weights.size() ) );
for ( ii=0; ii<m_Weights.size(); ++ii )
{
dErr_wrt_dWn[ ii ] = 0.0;
}
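// A possible alternative to the _alloca approach above (a sketch only, not part of the original
// design): give the layer a reusable scratch buffer, e.g. a hypothetical member
//
//    std::vector< double > m_dErrScratch;   // assumed member, grown once and reused
//
// and then, per call:
//
//    m_dErrScratch.assign( m_Weights.size(), 0.0 );
//    double* dErr_wrt_dWn = &m_dErrScratch[ 0 ];
//
// This amortizes the allocation across calls and avoids the stack-overflow risk, but it is not
// safe if several threads backpropagate through the same layer concurrently, which the atomic
// weight update below suggests can happen.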
VectorNeurons::iterator nit;
VectorConnections::iterator cit;
BOOL bMemorized = ( thisLayerOutput != NULL ) && ( prevLayerOutput != NULL );
// calculate dErr_wrt_dYn = F'(Yn) * dErr_wrt_Xn
for ( ii=0; ii<m_Neurons.size(); ++ii )
{
ASSERT( ii<dErr_wrt_dYn.size() );
ASSERT( ii<dErr_wrt_dXn.size() );
if ( bMemorized != FALSE )
{
output = (*thisLayerOutput)[ ii ];
}
else
{
output = m_Neurons[ ii ]->output;
}
dErr_wrt_dYn[ ii ] = DSIGMOID( output ) * dErr_wrt_dXn[ ii ];
}
// calculate dErr_wrt_Wn = Xnm1 * dErr_wrt_Yn
// For each neuron in this layer, go through the list of connections from the prior layer, and
// update the differential for the corresponding weight
ii = 0;
for ( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
{
NNNeuron& n = *(*nit); // for simplifying the terminology
for ( cit=n.m_Connections.begin(); cit<n.m_Connections.end(); cit++ )
{
kk = (*cit).NeuronIndex;
if ( kk == ULONG_MAX )
{
output = 1.0; // this is the bias weight
}
else
{
ASSERT( kk<m_pPrevLayer->m_Neurons.size() );
if ( bMemorized != FALSE )
{
output = (*prevLayerOutput)[ kk ];
}
else
{
output = m_pPrevLayer->m_Neurons[ kk ]->output;
}
}
//////////// ASSERT( (*cit).WeightIndex < dErr_wrt_dWn.size() ); // disabled: since dErr_wrt_dWn is now a C-style array, size() is no longer available
ASSERT( ii<dErr_wrt_dYn.size() );
dErr_wrt_dWn[ (*cit).WeightIndex ] += dErr_wrt_dYn[ ii ] * output;
}
ii++;
}
// calculate dErr_wrt_Xnm1 = Wn * dErr_wrt_dYn, which is needed as the input value of
// dErr_wrt_Xn for backpropagation of the next (i.e., previous) layer
// For each neuron in this layer
ii = 0;
for ( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
{
NNNeuron& n = *(*nit); // for simplifying the terminology
for ( cit=n.m_Connections.begin(); cit<n.m_Connections.end(); cit++ )
{
kk=(*cit).NeuronIndex;
if ( kk != ULONG_MAX )
{
// we exclude ULONG_MAX, which signifies the phantom bias neuron with
// constant output of "1", since we cannot train the bias neuron
nIndex = kk;
ASSERT( nIndex<dErr_wrt_dXnm1.size() );
ASSERT( ii<dErr_wrt_dYn.size() );
ASSERT( (*cit).WeightIndex<m_Weights.size() );
dErr_wrt_dXnm1[ nIndex ] += dErr_wrt_dYn[ ii ] * m_Weights[ (*cit).WeightIndex ]->value;
}
}
ii++; // ii tracks the neuron iterator
}
struct DOUBLE_UNION
{
union
{
double dd;
unsigned __int64 ullong;
};
};
DOUBLE_UNION oldValue, newValue;
// finally, update the weights of this layer using dErr_wrt_dW and the learning rate eta.
// Use an atomic compare-and-exchange operation, since another thread might be in the middle
// of its own backpropagation pass and might have shifted the weights slightly in the meantime
double dMicron = ::GetPreferences().m_dMicronLimitParameter;
double epsilon, divisor;
for ( jj=0; jj<m_Weights.size(); ++jj )
{
divisor = m_Weights[ jj ]->diagHessian + dMicron ;
// the following code has been rendered unnecessary, since the value of the Hessian has been
// verified when it was created, so as to ensure that it is strictly
// zero-positive. Thus, it is impossible for the diagHessian to be less than zero,
// and it is impossible for the divisor to be less than dMicron
/*
if ( divisor < dMicron )
{
// it should not be possible to reach here, since everything in the second derivative equations
// is strictly zero-positive, and thus "divisor" should definitely be as large as MICRON.
ASSERT( divisor >= dMicron );
divisor = 1.0 ; // this will limit the size of the update to the same as the size of global eta
}
*/
epsilon = etaLearningRate / divisor;
oldValue.dd = m_Weights[ jj ]->value;
newValue.dd = oldValue.dd - epsilon * dErr_wrt_dWn[ jj ];
while ( oldValue.ullong != _InterlockedCompareExchange64( (unsigned __int64*)(&m_Weights[ jj ]->value),
newValue.ullong, oldValue.ullong ) )
{
// another thread must have modified the weight. Obtain its new value, adjust it, and try again
oldValue.dd = m_Weights[ jj ]->value;
newValue.dd = oldValue.dd - epsilon * dErr_wrt_dWn[ jj ];
}
}
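// For comparison, the same lock-free update expressed with C++11 std::atomic (a sketch only,
// assuming the weight value were stored as a std::atomic<double> named w, which is not how
// NNWeight declares it above):
//
//    double expected = w.load();
//    double desired  = expected - epsilon * dErr_wrt_dWn[ jj ];
//    while ( !w.compare_exchange_weak( expected, desired ) )
//    {
//        // on failure, "expected" is refreshed with the current value; recompute and retry
//        desired = expected - epsilon * dErr_wrt_dWn[ jj ];
//    }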
}
void NNLayer::PeriodicWeightSanityCheck()
{
// called periodically by the neural net, to request a check on the "reasonableness" of the
// weights. The warning message is given only once per layer
VectorWeights::iterator wit;
for ( wit=m_Weights.begin(); wit<m_Weights.end(); wit++ )
{
NNWeight& ww = *(*wit);
double val = fabs( ww.value );
if ( (val>100.0) && (m_bFloatingPointWarning == false) )
{
// 100.0 is an arbitrary value that no reasonable weight should ever exceed
CString strMess;
strMess.Format( _T( "Caution: Weights are becoming unboundedly large \n" )
_T( "Layer: %s \nWeight: %s \nWeight value = %g \nWeight Hessian = %g\n\n" )
_T( "Suggest abandoning this backpropagation and investigating" ),
label.c_str(), ww.label.c_str(), ww.value, ww.diagHessian );
::MessageBox( NULL, strMess, _T( "Problem With Weights" ), MB_ICONEXCLAMATION | MB_OK );
m_bFloatingPointWarning = true;
}
}
}
void NNLayer::EraseHessianInformation()
{
// goes through all the weights associated with this layer, and sets each of their
// diagHessian value to zero
VectorWeights::iterator wit;
for ( wit=m_Weights.begin(); wit<m_Weights.end(); wit++ )
{
(*wit)->diagHessian = 0.0;
}
}
void NNLayer::DivideHessianInformationBy(double divisor)
{
// goes through all the weights associated with this layer, and divides each of their
// diagHessian value by the indicated divisor
VectorWeights::iterator wit;
double dTemp;
for ( wit=m_Weights.begin(); wit<m_Weights.end(); wit++ )
{
dTemp = (*wit)->diagHessian;
if ( dTemp < 0.0 )
{
// it should not be possible to reach here, since the second-derivative calculations are
// strictly non-negative; clamp defensively anyway
ASSERT( dTemp >= 0.0 );
dTemp = 0.0;
}
(*wit)->diagHessian = dTemp / divisor;
}
}