// cpolicies.h
// Copyright (C) 2003
// Gerhard Neumann (gerhard@igi.tu-graz.ac.at)

//                
// This file is part of RL Toolbox.
// http://www.igi.tugraz.at/ril_toolbox
//
// All rights reserved.
// 
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the author may not be used to endorse or promote products
//    derived from this software without specific prior written permission.
// 
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifndef CTDLEARNERPOLICIES_H
#define CTDLEARNERPOLICIES_H

#include "cagentcontroller.h"
#include "cqfunction.h"
#include "cactionstatistics.h"
#include "cstateproperties.h"
#include "ctransitionfunction.h"
#include "ril_debug.h"

class CAbstractFeatureStochasticEstimatedModel;
class CTransitionFunction;

/// Greedy Policy based on a Q-Function
/** 
This policy always takes the greedy action (the action with the highest Q-Value). It cannot be used as a stochastic policy; if a stochastic greedy policy is needed, use CQStochasticPolicy with a greedy distribution.
*/
class CQGreedyPolicy : public CAgentController
{
protected:
	CAbstractQFunction *qFunction;
public:
	CQGreedyPolicy(CActionSet *actions, CAbstractQFunction *qFunction);

	/// Always returns the greedy action
	virtual CAction *getNextAction(CStateCollection *state, CActionDataSet *data = NULL);

};
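
// Usage sketch (added for illustration; the actionSet, qFunction and state objects are
// assumed to be constructed elsewhere with the toolbox): building a greedy controller
// from a Q-Function and querying it for a single state.
inline CAction *selectGreedyAction(CActionSet *actionSet, CAbstractQFunction *qFunction,
                                   CStateCollection *state)
{
	CQGreedyPolicy greedyPolicy(actionSet, qFunction);  // greedy controller over actionSet
	return greedyPolicy.getNextAction(state);           // action with the highest Q-Value
}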

/// Action distribution classes define the distributions of stochastic policies
/** 
Action distributions calculate the distribution used for sampling an action, which is done by the class CStochasticPolicy. The distribution calculation usually depends on some kind of Q-Value of the actions and is performed in the function getDistribution. The function receives as input the current state, all available actions, and the Q-Values of the actions (actually it can be any kind of value rating an action) as an rlt_real array. Usually only these Q-Values are used for the distribution (the state is only needed for special exploration policies). The function has to overwrite the Q-Value rlt_real array with the distribution values.
Additionally, some algorithms need a differentiable distribution. Therefore the interface provides the function isDifferentiable (since not all distributions are differentiable) and the function getGradientFactors. The latter calculates the gradient dP(usedAction | actionFactors) / d(actionFactors). The action factors are again some kind of rating for the actions. The result has to be written into the output vector gradientFactors, which always has the same size as the actionFactors array (i.e. the number of actions). Only the softmax distribution supports calculating this gradient.
*/
class CActionDistribution : virtual public CParameterObject
{
public:
	/// Returns the distribution over the actions that is sampled by a stochastic policy
/** 
The function receives as input the current state, all available actions, and the Q-Values of the actions (actually it can be any kind of value rating an action) as an rlt_real array. Usually only these Q-Values are used for the distribution (the state is only needed for special exploration policies). The function has to overwrite the Q-Value rlt_real array with the distribution values.
*/
	virtual void getDistribution(CStateCollection *state, CActionSet *availableActions, rlt_real *actionFactors) = 0;
	virtual bool isDifferentiable() {return false;};

/// Calculates the derivative of the probability of choosing the specified action.
/**	 The function calculates the gradient dP(usedAction | actionFactors) / d(actionFactors). The action factors are again some kind of rating for the actions. The result has to be written into the output vector gradientFactors, which always has the same size as the actionFactors array (i.e. the number of actions). Only the softmax distribution supports calculating this gradient.*/
	virtual void getGradientFactors(CStateCollection *state, CAction *usedAction, CActionSet *actions, rlt_real *actionFactors, CMyVector *gradientFactors) {};
};
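
// Illustration (added by the editor, not part of the original toolbox): a minimal concrete
// distribution that spreads the probability mass uniformly over the available actions. It
// only demonstrates the in-place contract of getDistribution described above; it assumes
// CActionSet exposes the size() of its underlying container, which is not shown in this
// header.
class CUniformDistribution : public CActionDistribution
{
public:
	virtual void getDistribution(CStateCollection *state, CActionSet *availableActions,
	                             rlt_real *actionFactors)
	{
		int numActions = (int)availableActions->size();
		for (int i = 0; i < numActions; i++)
		{
			// overwrite the incoming action ratings with probabilities
			actionFactors[i] = (rlt_real)(1.0 / numActions);
		}
	}
};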

/// Softmax distribution for stochastic policies. 
/**
This class implements the well-known softmax distribution (sometimes called the Gibbs distribution). The softmax distribution is differentiable and can therefore be used for policy gradient algorithms. The distribution depends on the parameter "SoftMaxBeta", which specifies the "greediness" of the distribution.
<p>
The class CSoftMaxDistribution has the following Parameters: 
- "SoftMaxBeta" : Greediness of the distribution 
*/

class CSoftMaxDistribution : public CActionDistribution
{
protected:
public:

	CSoftMaxDistribution(rlt_real beta);

	virtual void getDistribution(CStateCollection *state, CActionSet *availableActions, rlt_real *values);

	virtual bool isDifferentiable() {return true;};

	virtual void getGradientFactors(CStateCollection *state, CAction *usedAction, CActionSet *actions, rlt_real *actionFactors, CMyVector *gradientFactors);

};
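
// Toolbox-independent sketch (added for illustration) of the softmax transformation the
// class above describes: P(a_i) = exp(beta * Q_i) / sum_j exp(beta * Q_j). The values are
// overwritten in place, matching the getDistribution contract. Subtracting the maximum
// before exponentiating is a common numerical-stability trick and an editorial choice,
// not necessarily what CSoftMaxDistribution itself does.
#include <cmath>
#include <algorithm>

inline void softmaxInPlace(double *values, int numActions, double beta)
{
	double maxValue = *std::max_element(values, values + numActions);
	double sum = 0.0;
	for (int i = 0; i < numActions; i++)
	{
		values[i] = std::exp(beta * (values[i] - maxValue));
		sum += values[i];
	}
	for (int i = 0; i < numActions; i++)
	{
		values[i] /= sum;
	}
}
// For the quantity getGradientFactors documents, the standard softmax derivative is
// dP(a_k)/dQ_j = beta * P(a_k) * (delta_kj - P(a_j)).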

class CAbsoluteSoftMaxDistribution : public CActionDistribution
{
protected:
public:

	CAbsoluteSoftMaxDistribution(rlt_real maxAbsValue);

	virtual void getDistribution(CStateCollection *state, CActionSet *availableActions, rlt_real *values);

	virtual bool isDifferentiable() {return false;};

	//virtual void getGradientFactors(CStateCollection *state, CAction *usedAction, CActionSet *actions, rlt_real *actionFactors, CMyVector *gradientFactors);
};

///Class for a greedy action distribution. 
/** 
This class implements a greedy action distribution: the probability of the best-rated action is always 1, and of all other actions 0. If there is more than one greedy action, the first one is always taken. Naturally, this distribution is not differentiable. 
*/

class CGreedyDistribution : public CActionDistribution
{
public:
	virtual void getDistribution(CStateCollection *state, CActionSet *availableActions, rlt_real *values);
};
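
// Toolbox-independent sketch (added for illustration) of the greedy distribution described
// above: probability 1 for the first best-rated action and 0 for all others, written over
// a plain array of action ratings.
inline void greedyDistributionInPlace(double *values, int numActions)
{
	int best = 0;
	for (int i = 1; i < numActions; i++)
	{
		if (values[i] > values[best])
		{
			best = i;                       // strict '>' keeps the first action on ties
		}
	}
	for (int i = 0; i < numActions; i++)
	{
		values[i] = (i == best) ? 1.0 : 0.0;
	}
}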

/// Class for the epsilon greedy action distribution. 
/**This class implements the epsilon-greedy action distribution. Epsilon-greedy policies take the greedy (best-rated) action with probability (1 - epsilon) and a random action with probability epsilon. If there is more than one greedy action, the first one is always taken. To set epsilon, use the parameter "EpsilonGreedy" or the constructor of the class. Naturally, this distribution is not differentiable.
<p>
The class CEpsilonGreedyDistribution has the following parameters:
- "EpsilonGreedy" : epsilon  
*/
class CEpsilonGreedyDistribution : public CActionDistribution
{
protected:
public:
//	rlt_real epsilon;

	CEpsilonGreedyDistribution(rlt_real epsilon);
	virtual void getDistribution(CStateCollection *state, CActionSet *availableActions, rlt_real *values);
};
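
// Toolbox-independent sketch (added for illustration) of one common form of the
// epsilon-greedy distribution described above: the exploration mass epsilon is spread
// uniformly over all actions (including the greedy one), so the greedy action ends up
// with 1 - epsilon + epsilon/n in total. Whether CEpsilonGreedyDistribution excludes the
// greedy action from the random draw is not visible from this header.
inline void epsilonGreedyDistributionInPlace(double *values, int numActions, double epsilon)
{
	int best = 0;
	for (int i = 1; i < numActions; i++)
	{
		if (values[i] > values[best])
		{
			best = i;                       // first greedy action on ties
		}
	}
	for (int i = 0; i < numActions; i++)
	{
		values[i] = epsilon / numActions;   // uniform exploration probability
	}
	values[best] += 1.0 - epsilon;          // remaining mass on the greedy action
}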


/// Class for modeling a stochastic policy. 
/**
Many algorithms need more than just a specific action for a specific state; especially when the policy is stochastic, the distribution for choosing an action is often needed. This is modeled by CStochasticPolicy. The policy chooses an action according to a given probability distribution, which you specify in the constructor with the CActionDistribution object. In the getNextAction method an action is chosen according to the distribution returned by getActionProbabilities. The getActionProbabilities method has to call the getDistribution method of the CActionDistribution object with the action ratings as input. How these action ratings are calculated has to be implemented by the subclasses; usually the values come from a Q-Function (see CQStochasticPolicy). Some algorithms, like policy gradient algorithms, need a differentiable action distribution, so CStochasticPolicy also provides an interface for differentiating the distribution with respect to the policy weights (weights of the Q-Function). 

The gradient calculation of the policy is already implemented. You can calculate either dP(action | state)/dweights or the logarithmic gradient, which is the same as dP(action | state)/dweights * 1 / P(action | state). Calculating the gradient of the action ratings (e.g. dQ(a, s)/dw for Q-Functions) has to be implemented in the function getActionGradient if the stochastic policy is supposed to be differentiable. Differentiable policies also have to overwrite the function isDifferentiable, which always returns false for the base class. Whether the policy is differentiable or not depends on the kind of action ratings and on the distribution; both of them have to be differentiable. 
The class also provides the possibility to get a statistics object for the chosen action. This is done by the virtual function getActionStatistics, which is called by the getNextAction function if a statistics object is requested. 
*/


class CStochasticPolicy: public CAgentStatisticController
{
protected:
/// array to store the current action probabilities
	rlt_real *actionValues;
	CActionDistribution *distribution;

	CMyVector *gradientFactors;

	CFeatureList *actionGradientFeatures;

/// virtual function for getting the action statistics for the chosen action
/**The class also provides the possibility to get a statistics object for the chosen action. This is done
by the virtual function getActionStatistics, which is called by the getNextAction function if a statistics object is requested.
*/
	virtual void getActionStatistics(CStateCollection *state, CAction *action, CActionStatistics *stat) {};

public:
	///Creates a stochastic policy which can choose from the actions in "actions".
	CStochasticPolicy(CActionSet *actions, CActionDistribution *distribution);
	~CStochasticPolicy();

	/// virtual function for retrieving the action probability distribution 

	/**
	For each action in the availableActions action set, the function has to calculate the probability and write it into the rlt_real array actionValues. The function first calculates the action ratings and then calculates the action distribution with the action distribution object.
*/
	virtual void getActionProbabilities(CStateCollection *state, CActionSet *availableActions, rlt_real *actionValues, CActionDataSet *actionDataSet = NULL);
/// Chooses an action according to the distribution from getActionProbabilities.
/**
First of all, the available actions for the current state are determined, and then the probabilities for these available
actions. Then an action is chosen from the available action set according to the distribution.
*/
	virtual CAction *getNextAction(CStateCollection *state, CActionDataSet *dataset, CActionStatistics *stat);
