// interface_classes.h
/* Sets learning parameters of the architecture corresponding to a given action.
a : action
argc : number of supplied arguments
argv : array of arguments
*/
void StateActionFA::setAllLearningParameters(int argc, char* argv[]);
/* Sets (the same) learning parameters of architectures corresponding to each action.
argc : number of supplied arguments
argv : array of arguments
Which parameters to pass in argv depends on the implementation
of the class derived from the Approximator class: pass the parameters
exactly as they would be passed to that class's setLearningParameters() function.
*/
};
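/* Example (illustrative sketch, not part of the original interface):
   passing learning parameters through the argc/argv-style interface above.
   The actual strings depend on the Approximator-derived class behind the
   architecture; "arch" and the numeric values below are placeholders.

       StateActionFA arch;                    // assumes a suitable constructor
       char p0[] = "0.1";                     // e.g. a learning rate
       char p1[] = "0.9";                     // e.g. a trace-decay parameter
       char* params[] = { p0, p1 };
       arch.setAllLearningParameters(2, params);
*/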
//////////////////////////////////////////////////////////////////////////////
struct Attributes{//implementation in environment.cpp
double* Entropy; //array where i-th item is the i-step Entropy
int n; //up to which step entropy is computed
double Controllability;
double RiskFactor;
double RFconst;
double RewardVariance;
double TransitionDistance;
double TransitionVariability;
Attributes();
Attributes(int N, double c);
/* N : up to which step the State Transition Entropy should be computed.
c : (multiplicative) threshold for the risk factor (in [0,1)).
*/
void setParameters(int N, double c);
/* Sets parameters for the attributes' calculation:
N : up to which step the State Transition Entropy should be computed.
c : (multiplicative) threshold for the risk factor (in [0,1)).
*/
~Attributes();
};
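/* Example (illustrative sketch): constructing the attributes structure for a
   10-step state transition entropy horizon with a risk-factor threshold of
   0.5; the numbers are placeholders.

       Attributes att(10, 0.5);     // N = 10, c = 0.5
       att.setParameters(20, 0.3);  // parameters may be reset before reuse
*/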
//////////////////////////////////////////////////////////////////////////////
class Environment {//implementation of some non-virtual functions in environment.cpp
protected:
State CurrentState;//current state
Action CurrentAction;//last action performed by the agent
double reward;//reward after transition to state s under action a
static long idum; //used by a random number generator
static bool seeded;//indicates if the random number generator has been seeded during this program run
public:
Environment();
/* Seeds and initializes the random number generator.
This constructor is automatically called when objects of the derived
classes are created.
*/
virtual void startState(State& start, bool& terminal)=0;
/* Samples a start state. Sets the CurrentState data member to that state
and also returns it in the "start" parameter. Returns an indication of whether
the sampled state is terminal in the "terminal" parameter.
*/
virtual void setState(const State& s, bool& terminal)=0;
/* Sets the CurrentState data member to state "s". Returns an indication of
whether that state is terminal in the "terminal" parameter.
*/
virtual void transition(const Action& action, State& s_new, double& r_new, bool& terminal)=0;
/* Implements a transition from CurrentState in response to the "action"
performed by the agent. Updates its internal variables
(CurrentAction and reward) and returns values to the agent.
action: action performed by the agent
s_new : return value - new state
r_new : return value - new reward
terminal: indication of whether s_new is a terminal state
*/
virtual bool applicable(const State& s, const Action& a)=0;
/* Checks if action a is applicable in state s.
*/
virtual void bound(int i, bool& bounded, double& left, double& right)=0;
/* Gives bounds on state variables' values
i : index of state variable
bounded: indicates if i^th variable is bounded
left : left bound
right: right bound
*/
void getStateSpaceBounds(double* left, double* right);
/* Returns bounds on state variables.
left : array of left bounds
right : array of right bounds
*/
virtual void uniformStateSample(State& s)=0;
/* Implements uniform state space sampling.
*/
//The following functions empirically measure task attributes
void computeAttributes(Attributes& att, const State& startState, int Steps, int Transitions, const int* n, const ActionSet& as, StateActionFA* fa=NULL);
/* Computes global values of the attributes for the state distribution
encountered on a trajectory under some policy.
att : attributes structure in which to return computed values;
startState : state from which to start a random walk;
Steps : maximum number of steps on the trajectory;
Transitions: number of sample transitions from each state;
n : array indicating into how many intervals each state variable
should be discretized for the approximate calculation of attributes;
as : action set for the current RL system;
fa : pointer to the architecture that contains action value functions
for each action. According to these value functions,
a greedy policy will be executed. To implement a uniformly
random policy instead, make sure that the parameters of the
architectures for all action-value functions are the same (so that all values are equal).
*/
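/* Example (illustrative sketch): measuring the attributes along a trajectory
   under the greedy policy with respect to the value functions held in "arch".
   "env" (a pointer to a concrete Environment), "arch", "start", and "actions"
   are assumed to exist already; the numeric arguments and the two-variable
   discretization are placeholders.

       Attributes att(5, 0.5);
       int n[] = { 10, 10 };                 // bins per state variable
       env->computeAttributes(att, start, 200, 30, n, actions, &arch);
*/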
void computeAttributes(Attributes& att, int SampleSize, int Transitions, const int* n, const ActionSet& as);
/* Computes global values of the attributes for the uniform state
distribution.
att : attributes structure in which to return computed values;
SampleSize : number of uniformly distributed samples across the state
space in which attribute values are computed and then averaged;
Transitions: number of sample transitions from each state;
n : array indicating into how many intervals each state variable should be discretized;
as : action set for the current RL system.
*/
double multiStepEntropy(int N, int sampleSize, int Transitions, const int* n, const ActionSet& as);
/* Computes multi-step state transition entropy.
N : number of steps over which entropy should be computed;
sampleSize : number of uniformly distributed samples across the state
space in which attribute values are computed and then averaged;
Transitions: number of sample transitions from each state;
n : array indicating into how many intervals each state variable should be discretized;
as : action set for the current RL system.
*/
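/* Example (illustrative sketch): the uniform-sampling variants of the
   attribute measurements; "env", "actions", and all numbers are placeholders.

       Attributes att(5, 0.5);
       int n[] = { 10, 10 };
       env->computeAttributes(att, 1000, 30, n, actions);
       double h = env->multiStepEntropy(5, 1000, 30, n, actions);
*/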
protected:
void chooseAction(double epsilon, StateActionFA* fa, const ActionSet& actions, const State& s, Action& a);
/* Implements an epsilon-greedy strategy based on action value
functions in the architecture pointed to by fa.
epsilon : parameter for the epsilon-greedy strategy;
fa : pointer to the architecture containing action value functions;
actions : action set for the current RL system;
s : state in which to choose action
a : return value - chosen action
*/
void actionSequence(int num, int n, int as_size, int* seq);
/* Used by the multiStepEntropy() function.
*/
};
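/* Example (illustrative sketch): a concrete environment must override every
   pure virtual member of Environment. "MyEnvironment" is a hypothetical name.

       class MyEnvironment : public Environment {
       public:
           void startState(State& start, bool& terminal);
           void setState(const State& s, bool& terminal);
           void transition(const Action& action, State& s_new,
                           double& r_new, bool& terminal);
           bool applicable(const State& s, const Action& a);
           void bound(int i, bool& bounded, double& left, double& right);
           void uniformStateSample(State& s);
       };
*/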
///////////////////////////////////////////////////////////////////////////
class Agent { //implementation of some non-virtual functions in agent.cpp
public:
Agent(double g, const ActionSet& a_s, StateActionFA* const f, Environment* const e);
/* Constructor.
g : discount factor
a_s : action set available to the agent
f : pointer to the architecture containing either action-value functions
or a random policy
e : pointer to the environment
*/
int initTrial(int N, bool learning, bool SaveTrajectory, const State* s = NULL, char* fileName = NULL, bool ComputeBellmanError = false);
/* Gets the start state from the environment
and then calls the appropriate act function to perform
the trial for a maximum of N steps.
The "learning" argument indicates whether learning should take place:
if so, the actAndLearn() function is called, otherwise
the act() function is called.
Computes the return for this trial.
Returns the number of steps actually performed during the trial.
N : maximal number of steps in the trial
learning : indicates whether learning should take place
SaveTrajectory : indicates whether the trajectory should be saved
s : optional pointer to a start state; NULL by default
fileName : name of the file to which the trajectory should be saved
ComputeBellmanError : indicates if the estimated Bellman error should be computed. Has a default false value.
*/
double getReturn();
/* Gets return collected during the last trial
*/
double getBellmanError();
/* Returns BellmanError for the last trajectory traversed without learning */
virtual void setLearningParameters(int argc, char *argv[])=0;
/* Sets parameters of the RL learning algorithm
*/
void setArchitectureParameters(const Action& a, int argc, char *argv[]);
/* Sets parameters of the architecture (fa) representing
a value function or a policy distribution for action "a".
a : action whose architecture is addressed
argc : number of arguments in argv array
argv : array of arguments
The two above arguments should be as they would be sent
to setArchitectureParameters() function of the derived approximator class.
*/
void saveArchitectureParameters(const Action& a, int argc, char *argv[]);
/* Saves parameters of the architecture (fa) representing
a value function or a policy distribution for action "a".
a : action whose architecture is addressed
argc : number of arguments in argv array
argv : array of arguments
The two above arguments should be as they would be sent
to saveArchitectureParameters() function of the derived approximator class.
*/
virtual ~Agent();
protected:
//component structures
struct StageInfo; //fully defined at the end of this class' declaration
struct Trajectory;//fully defined at the end of this class' declaration
//Data members
State CurrentState;//current state of the environment
Action CurrentAction;//action chosen in the current state
bool terminal; //indicates if the state is terminal
double CurrentReward;
const ActionSet& actions; //action set of the RL system
StateActionFA* const fa; /* Pointer to an architecture representing
either a policy probability distribution
or action-value functions.
*/
double gamma; //discount factor
double Return;//return collected during a trial
Environment* const env;//pointer to the environment object
int* ApplicableActions;//array to be used by chooseAction() function
Trajectory* trajectory;
double BellmanError;
virtual int act(int N, bool SaveTrajectory, bool ComputeBellmanError)=0;
/* Implements a maximum of N successive steps of the trial,
or runs until a terminal state is entered by the environment.
Communicates with the environment to get the current state and reward.
Computes the return collected on this trial.
N : maximal number of steps in the trial
SaveTrajectory : indicates whether trajectory should be saved
ComputeBellmanError : indicates whether (estimated) Bellman error
should be computed for the state action pairs on the trajectory.
*/
virtual int actAndLearn(int N, bool SaveTrajectory)=0;
/* Implements a maximum of N successive steps of the trial,
or runs until a terminal state is entered by the environment.
Communicates with the environment to get
the current state and reward.
Calls the rlAlgorithm when appropriate.
Computes the return collected on this trial.
N : maximal number of steps in the trial;
SaveTrajectory : indicates whether trajectory should be saved
*/
virtual void chooseAction(const State& s, Action& a) =0 ;
/* Implements the behavior policy.
Uses fa, the representation of the random policy or the action-value functions.
s : state in which the action should be performed
a : chosen action
*/
// component structures
struct StageInfo{
State state;
Action action;
double reward;
double* Qvalue;
double TDerror;
StageInfo(){
Qvalue = new double[Action::count];
}
~StageInfo(){
delete [] Qvalue;
}
};
struct Trajectory{
StageInfo* stage;
int length; //actual length of the recorded trajectory
Trajectory(int n){
// n is the maximal number of stages in the trajectory
stage = new StageInfo[n];
length=0;
}
~Trajectory(){
delete [] stage;
}
};
};
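/* Example (illustrative sketch): running a learning trial followed by an
   evaluation trial. Agent is abstract, so "SarsaAgent" and "MyEnvironment"
   stand for hypothetical derived classes; all numbers are placeholders.

       ActionSet actions;                        // assumes a suitable constructor
       StateActionFA arch;
       MyEnvironment env;
       SarsaAgent agent(0.95, actions, &arch, &env);
       agent.initTrial(1000, true,  false);      // learn, at most 1000 steps
       agent.initTrial(1000, false, false);      // evaluate without learning
       double ret = agent.getReturn();           // return collected in last trial
*/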
int tokenize(char* sep, char* str, int** tokens);
/* Extracts tokens from string "str", which are separated by separators
specified in "sep". Allocates array (*tokens) of the appropriate size
and saves extracted tokens in that array. Returns the number of tokens.
*/
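/* Example (illustrative sketch): extracting integer tokens from a
   comma-separated string. The caller is assumed to release the allocated
   array; the separator and input are placeholders.

       char sep[] = ",";
       char str[] = "3,5,7";
       int* tokens;
       int count = tokenize(sep, str, &tokens);  // count == 3, tokens = {3, 5, 7}
       delete [] tokens;                         // assumes allocation with new[]
*/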