// cvaps.cpp
// Copyright (C) 2003
// Gerhard Neumann (gerhard@igi.tu-graz.ac.at)
//
// This file is part of RL Toolbox.
// http://www.igi.tugraz.at/ril_toolbox
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// 3. The name of the author may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ril_debug.h"
#include "cvaps.h"
#include <math.h>
/**
 * Creates an episode-wise gradient learner.
 *
 * Registers the parameters "MaxSteps" (-1), "LearningRate" (0.0001) and
 * "ETraceFactor" (0.9 * 0.95), merges in the parameters of the error
 * function, and allocates the gradient buffer, the accumulated gradient
 * trace and the gradient e-traces that the destructor releases.
 *
 * @param reward reward function forwarded to the CSemiMDPRewardListener base.
 * @param error  error function later queried for errors and error gradients.
 */
CGradientEpisodeLearner::CGradientEpisodeLearner(CRewardFunction *reward, CErrorFunction *error) : CSemiMDPRewardListener(reward)
{
    // Plain state first.
    errorFunction = error;
    nSteps = 0;

    // Feature lists and e-traces owned by this learner.
    dWBuf = new CFeatureList();
    dWTrace = new CFeatureList();
    dWETraces = new CGradientVETraces(NULL);

    // Tunable parameters (registration order kept as before).
    addParameter("MaxSteps", -1);
    addParameter("LearningRate", 0.0001);
    addParameter("ETraceFactor", 0.9 * 0.95);
    addParameters(errorFunction);
}
/**
 * Releases the three objects allocated by the constructor (gradient
 * e-traces, accumulated gradient trace and per-step gradient buffer).
 */
CGradientEpisodeLearner::~CGradientEpisodeLearner()
{
    // Released in reverse order of allocation; the objects are independent.
    delete dWETraces;
    delete dWTrace;
    delete dWBuf;
}
/**
 * Processes one transition and accumulates its contribution to the
 * episode gradient held in dWTrace.
 *
 * The error gradient of the step is written to dWBuf. If gradient e-traces
 * are present they are decayed by ETraceFactor^duration, the new gradient is
 * added, and every e-trace entry (scaled by the derivated error) is added to
 * dWTrace; otherwise the raw step gradient is used directly.
 *
 * When "MaxSteps" is positive and the step counter reaches it, newEpisode()
 * is called, which applies the accumulated gradient and resets the traces.
 *
 * @param oldState state before the action.
 * @param action   executed action; its duration scales the e-trace decay.
 * @param reward   reward received for the transition.
 * @param newState state after the action.
 */
void CGradientEpisodeLearner::nextStep(CStateCollection *oldState, CAction *action, rlt_real reward, CStateCollection *newState)
{
    CFeatureList::iterator it;

    nSteps ++;

    // update dW
    dWBuf->clear();

    rlt_real derivatedError = errorFunction->getDerivatedError(oldState, action, reward, newState, NULL);
    errorFunction->getErrorGradient(oldState, action, newState, reward, dWBuf);

    if (dWETraces)
    {
        dWETraces->multETraces(pow(getParameter("ETraceFactor"), action->getDuration()));
        dWETraces->addGradientETrace(dWBuf, 1.0);

        for (it = dWETraces->getGradientETraces()->begin(); it != dWETraces->getGradientETraces()->end(); it ++)
        {
            dWTrace->update((*it)->featureIndex, (*it)->factor * derivatedError);
        }
    }
    else
    {
        for (it = dWBuf->begin(); it != dWBuf->end(); it ++)
        {
            dWTrace->update((*it)->featureIndex, (*it)->factor * derivatedError);
        }
    }

    if (DebugIsEnabled('t'))
    {
        DebugPrint('t', "dWTrace:");
        // Dump to the same channel that was checked and that carries the
        // header/newline; the 'p' handle used before is unrelated to the
        // guard above.
        dWTrace->saveASCII(DebugGetFileHandle('t'));
        DebugPrint('t', "\n");
    }

    if (getParameter("MaxSteps") > 0 && nSteps >= getParameter("MaxSteps"))
    {
        // getMaxSteps() yields an int matching the %d specifier; passing the
        // rlt_real parameter value through varargs for %d is undefined.
        // NOTE(review): nSteps is assumed to be an int - confirm in cvaps.h.
        DebugPrint('t', "GradientEpisodeLearner: Steps (%d) > MaxSteps (%d)\n", nSteps, getMaxSteps());
        newEpisode();
    }
}
void CGradientEpisodeLearner::newEpisode()
{
updateGradient();
resetTraces();
}
void CGradientEpisodeLearner::updateGradient()
{
//rlt_real norm = sqrt(dWTrace->multFeatureList(dWTrace));
dWTrace->multFactor(- getAlpha());
errorFunction->updateGradient(dWTrace);
}
/**
 * Clears the accumulated gradient trace, resets the gradient e-traces and
 * sets the step counter back to zero.
 */
void CGradientEpisodeLearner::resetTraces()
{
    dWTrace->clear();

    // nextStep() treats dWETraces as optional, so guard the dereference here
    // as well instead of crashing when no e-traces are in use.
    if (dWETraces)
    {
        dWETraces->resetETraces();
    }

    nSteps = 0;
}
/**
 * Sets the learning rate.
 *
 * @param value new value stored under the "LearningRate" parameter.
 */
void CGradientEpisodeLearner::setAlpha(rlt_real value)
{
    this->setParameter("LearningRate", value);
}
/**
 * @return the current value of the "LearningRate" parameter.
 */
rlt_real CGradientEpisodeLearner::getAlpha()
{
    rlt_real learningRate = getParameter("LearningRate");
    return learningRate;
}
void CGradientEpisodeLearner::setMaxSteps(int maxSteps)
{
setParameter("MaxSteps", maxSteps);
}
int CGradientEpisodeLearner::getMaxSteps()
{
return my_round(getParameter("MaxSteps"));
}
/**
 * Creates a VAPS gradient learner on top of CGradientEpisodeLearner.
 *
 * Asserts that the policy reports itself as differentiable, merges the
 * policy's parameters into this learner, registers "PolicyLearningRate"
 * (1.0) and allocates the policy-gradient trace and buffer released by the
 * destructor.
 *
 * @param reward reward function forwarded to the base learner.
 * @param error  error function forwarded to the base learner.
 * @param policy stochastic policy whose log-probability gradient is used.
 */
CVAPSGradientLearner::CVAPSGradientLearner(CRewardFunction *reward, CErrorFunction *error, CStochasticPolicy *policy) : CGradientEpisodeLearner(reward, error)
{
    assert(policy->isDifferentiable());

    this->policy = policy;

    // Feature lists owned by this learner.
    dTTrace = new CFeatureList();
    dTBuf = new CFeatureList();

    // Parameter registration (order kept as before).
    addParameters(policy);
    addParameter("PolicyLearningRate", 1.0);
}
/**
 * Releases the policy-gradient buffer and trace allocated by the
 * constructor.
 */
CVAPSGradientLearner::~CVAPSGradientLearner()
{
    // The two lists are independent, so the release order is irrelevant.
    delete dTBuf;
    delete dTTrace;
}
/**
 * Processes one transition for the VAPS learner.
 *
 * First the gradient of the log action probability is accumulated in
 * dTTrace. Then every dTTrace entry, multiplied by the step error and
 * "PolicyLearningRate", is added to dWTrace. Finally the base-class
 * nextStep() adds the error-function part of the gradient (and may end the
 * episode if the step limit is reached).
 *
 * @param oldState state before the action.
 * @param action   executed action.
 * @param reward   reward received for the transition.
 * @param newState state after the action.
 */
void CVAPSGradientLearner::nextStep(CStateCollection *oldState, CAction *action, rlt_real reward, CStateCollection *newState)
{
    rlt_real error = errorFunction->getError(oldState, action, reward, newState);

    // Update T
    dTBuf->clear();
    policy->getActionProbabilityLnGradient(oldState, action, action->getActionData(), dTBuf);

    CFeatureList::iterator it;
    for (it = dTBuf->begin(); it != dTBuf->end(); it ++)
    {
        dTTrace->update((*it)->featureIndex, (*it)->factor);
    }

    // update dW
    rlt_real policyLearningRate = getParameter("PolicyLearningRate");
    for (it = dTTrace->begin(); it != dTTrace->end(); it ++)
    {
        dWTrace->update((*it)->featureIndex, (*it)->factor * error * policyLearningRate);
    }

    CGradientEpisodeLearner::nextStep(oldState, action, reward, newState);

    if (DebugIsEnabled('t'))
    {
        DebugPrint('t', "dTTrace:");
        // Dump to the channel that was checked and that carries the
        // header/newline; the 'p' handle used before is unrelated to the
        // guard above.
        dTTrace->saveASCII(DebugGetFileHandle('t'));
        DebugPrint('t', "\n");
    }
}
void CVAPSGradientLearner::resetTraces()
{
CGradientEpisodeLearner::resetTraces();
dTTrace->clear();
}
/**
 * Creates a gradient calculator that estimates the VAPS gradient by letting
 * the agent act while this object listens to its steps.
 *
 * Registers the parameter "GradientEstimationSteps" with the value TSteps.
 *
 * @param reward reward function forwarded to CVAPSGradientLearner.
 * @param error  error function forwarded to CVAPSGradientLearner.
 * @param policy policy forwarded to both base classes.
 * @param agent  agent used by getGradient() to generate steps; stored, and
 *               presumably not owned (nothing in this file deletes it).
 * @param TSteps number of steps used for one gradient estimate.
 */
CVAPSGradientCalculator::CVAPSGradientCalculator(CRewardFunction *reward, CErrorFunction *error, CStochasticPolicy *policy, CAgent *agent, int TSteps) : CPolicyGradientCalculator(policy), CVAPSGradientLearner(reward, error, policy)
{
    this->agent = agent;

    // The output list is only supplied by getGradient(); start from a
    // well-defined value instead of an indeterminate pointer.
    this->gradient = NULL;

    addParameter("GradientEstimationSteps", TSteps);
}
/**
 * Destructor.
 *
 * Intentionally empty. The previous body rescaled dWTrace and wrote the
 * result through the stored gradient pointer. That pointer is the list
 * passed to the last getGradient() call (or was never set at all), so it may
 * be dangling or indeterminate during destruction, and the norm used as
 * divisor may be zero. Destruction must not modify caller data.
 *
 * The feature lists used by this class are released by the base-class
 * destructors.
 */
CVAPSGradientCalculator::~CVAPSGradientCalculator()
{
}
/**
 * Adds the negated, unit-length accumulated gradient trace to the gradient
 * list that is currently being filled.
 *
 * dWTrace is normalised in place by its Euclidean norm (square root of its
 * product with itself) before it is added.
 */
void CVAPSGradientCalculator::updateGradient()
{
    // Nothing to accumulate into if no target list has been supplied.
    if (gradient == NULL)
    {
        return;
    }

    rlt_real norm = sqrt(dWTrace->multFeatureList(dWTrace));

    // An empty or all-zero trace has norm 0; dividing by it would inject
    // inf/NaN into the gradient. Such a trace contributes nothing, so skip.
    // The negated comparison also rejects a NaN norm.
    if (!(norm > 0))
    {
        return;
    }

    dWTrace->multFactor(- 1.0 / norm);
    gradient->add(dWTrace);
}
/**
 * Estimates the gradient by running the agent for
 * "GradientEstimationSteps" steps while this object listens to the steps.
 *
 * The passed list is stored as the accumulation target used by
 * updateGradient(). If this object was not already registered as a listener
 * of the agent, it is registered for the duration of the call and removed
 * again afterwards.
 *
 * @param gradient list that receives the accumulated gradient.
 */
void CVAPSGradientCalculator::getGradient(CFeatureList *gradient)
{
    this->gradient = gradient;

    const int totalSteps = my_round(getParameter("GradientEstimationSteps"));
    int stepsDone = 0;

    agent->startNewEpisode();

    // Only attach (and later detach) ourselves if nobody did it before.
    const bool alreadyListening = agent->isListenerAdded(this);
    if (!alreadyListening)
    {
        agent->addSemiMDPListener(this);
    }

    printf("Calculating PGradient with %d steps\n", totalSteps);

    // Run single episodes until the requested number of steps is reached.
    while (stepsDone < totalSteps)
    {
        stepsDone += agent->doControllerEpisode(1, totalSteps - stepsDone);
    }

    if (!alreadyListening)
    {
        agent->removeSemiMDPListener(this);
    }

    if (DebugIsEnabled('g'))
    {
        DebugPrint('g', "Calculated VAPS Gradient (%d steps)\n", totalSteps);
        gradient->saveASCII(DebugGetFileHandle('g'));
        DebugPrint('g', "\n");
    }
}
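// Code-viewer keyboard shortcuts (web page residue, not part of the source):
//   Copy code:        Ctrl + C
//   Search code:      Ctrl + F
//   Full screen:      F11
//   Toggle theme:     Ctrl + Shift + D
//   Show shortcuts:   ?
//   Increase font:    Ctrl + =
//   Decrease font:    Ctrl + -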