// cpolicies.cpp
// Copyright (C) 2003
// Gerhard Neumann (gerhard@igi.tu-graz.ac.at)
//
// This file is part of RL Toolbox.
// http://www.igi.tugraz.at/ril_toolbox
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// 3. The name of the author may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ril_debug.h"
#include "cpolicies.h"
#include "cutility.h"
#include "ctheoreticalmodel.h"
#include <assert.h>
#include <math.h>
CQGreedyPolicy::CQGreedyPolicy(CActionSet *actions, CAbstractQFunction *qFunction) : CAgentController(actions)
{
this->qFunction = qFunction;
}
CAction *CQGreedyPolicy::getNextAction(CStateCollection *state, CActionDataSet *data)
{
return qFunction->getMax(state, actions, data);
}
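// CSoftMaxDistribution converts action values Q(s,a_i) into a Boltzmann
// (soft-max) distribution: p(a_i) = exp(beta * Q_i) / sum_j exp(beta * Q_j).
// Higher beta makes the policy greedier; beta -> 0 approaches a uniform
// distribution. For numerical stability, getDistribution subtracts the
// minimum value before exponentiating (which leaves the normalized
// probabilities unchanged) and caps beta so that the largest exponent,
// beta * (maxValue - minValue), stays below MAX_EXP.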
CSoftMaxDistribution::CSoftMaxDistribution(rlt_real beta)
{
addParameter("SoftMaxBeta", beta);
}
void CSoftMaxDistribution::getDistribution(CStateCollection *state, CActionSet *availableActions, rlt_real *values)
{
rlt_real sum = 0.0;
rlt_real beta = getParameter("SoftMaxBeta");
unsigned int i;
unsigned int numValues = availableActions->size();
rlt_real minValue = values[0];
rlt_real maxValue = values[0];
for (i = 1; i < numValues; i++)
{
if (minValue > values[i])
{
minValue = values[i];
}
if (maxValue < values[i])
{
maxValue = values[i];
}
}
if (beta * (maxValue - minValue) > MAX_EXP)
{
beta = MAX_EXP / (maxValue - minValue);
}
for (i = 0; i < numValues; i++)
{
values[i] = exp(beta * (values[i] - minValue));
sum += values[i];
}
assert(sum > 0);
for (i = 0; i < numValues; i++)
{
values[i] = values[i] / sum;
assert(values[i] >= 0 && values[i] <= 1);
}
}
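// Gradient factors of the soft-max distribution: for the chosen action a_k,
// the partial derivative of p(a_k) with respect to the action value Q_i is
// beta * p_k * (delta_ki - p_i). The loop below fills in the
// -beta * p_k * p_i term for every action, and the Kronecker-delta term
// beta * p_k is then added to the entry of the chosen action. The same kind
// of exponent cap as in getDistribution (here hard-coded to 200) guards
// against overflow in exp().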
void CSoftMaxDistribution::getGradientFactors(CStateCollection *state, CAction *usedAction, CActionSet *availableActions, rlt_real *actionValues, CMyVector *factors)
{
int numValues = availableActions->size();
rlt_real normTerm = 0.0;
rlt_real beta = getParameter("SoftMaxBeta");
int actIndex = availableActions->getIndex(usedAction);
rlt_real minValue = actionValues[0];
rlt_real maxValue = actionValues[0];
DebugPrint('p', "SoftMax Gradient Factors:\n");
for (int i = 0; i < numValues; i++)
{
if (minValue > actionValues[i])
{
minValue = actionValues[i];
}
if (maxValue < actionValues[i])
{
maxValue = actionValues[i];
}
DebugPrint('p', "%f ", actionValues[i]);
}
DebugPrint('p', "\n");
if (beta * (maxValue - minValue) > 200)
{
beta = 200 / (maxValue - minValue);
}
for (int i = 0; i < numValues; i++)
{
normTerm += exp(beta * (actionValues[i] - minValue));
}
rlt_real buf = exp(beta * (actionValues[actIndex] - minValue));
DebugPrint('p', "Beta:%f\n", normTerm);
for (int i = 0; i < numValues; i ++)
{
factors->setElement(i, - beta * buf * exp(beta * (actionValues[i] - minValue)) / pow(normTerm, (rlt_real)2.0));
}
factors->setElement(actIndex, factors->getElement(actIndex) + buf * beta / normTerm);
DebugPrint('p', "SoftMax Gradient Factors:\n");
for (int i = 0; i < numValues; i ++)
{
DebugPrint('p', "%f ", factors->getElement(i));
}
DebugPrint('p', "\n");
}
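// CAbsoluteSoftMaxDistribution is a soft-max variant with an adaptive
// temperature: beta is rescaled on each call so that the action value with
// the largest magnitude maps to the configured absolute value before the
// exponent is taken, i.e. beta = absoluteValue / max(|minValue|, |maxValue|).
// If all values are numerically zero, a fixed beta of 100 is used, and beta
// is additionally capped so that the largest exponent stays below 400.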
CAbsoluteSoftMaxDistribution::CAbsoluteSoftMaxDistribution(rlt_real absoluteValue)
{
addParameter("SoftMaxAbsoluteValue", absoluteValue);
}
void CAbsoluteSoftMaxDistribution::getDistribution(CStateCollection *state, CActionSet *availableActions, rlt_real *values)
{
rlt_real sum = 0.0;
rlt_real absoluteValue = getParameter("SoftMaxAbsoluteValue");
unsigned int i;
unsigned int numValues = availableActions->size();
rlt_real beta = 0.0;
rlt_real minValue = values[0];
rlt_real maxValue = values[0];
for (i = 1; i < numValues; i++)
{
if (minValue > values[i])
{
minValue = values[i];
}
if (maxValue < values[i])
{
maxValue = values[i];
}
}
if ((fabs(maxValue) <= 0.0000001 && fabs(minValue) <= 0.0000001))
{
beta = 100;
}
else
{
if (fabs(maxValue) < fabs(minValue) )
{
beta = absoluteValue / (fabs(minValue));
}
else
{
beta = absoluteValue / (fabs(maxValue));
}
}
if (beta * fabs((maxValue - minValue)) > 400)
{
beta = 400 / fabs(maxValue - minValue);
}
for (i = 0; i < numValues; i++)
{
values[i] = exp(beta * (values[i] - minValue));
sum += values[i];
}
assert(sum > 0);
for (i = 0; i < numValues; i++)
{
values[i] = values[i] / sum;
assert(values[i] >= 0 && values[i] <= 1);
}
}
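// CGreedyDistribution turns the action values into a deterministic
// distribution: the action with the highest value receives probability 1.0,
// all others 0.0. Ties are broken in favor of the first maximum.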
void CGreedyDistribution::getDistribution(CStateCollection *state, CActionSet *availableActions, rlt_real *actionValues)
{
rlt_real max = actionValues[0];
int maxIndex = 0;
unsigned int numValues = availableActions->size();
actionValues[0] = 0.0;
for (unsigned int i = 1; i < numValues; i++)
{
if (actionValues[i] > max)
{
max = actionValues[i];
maxIndex = i;
}
actionValues[i] = 0.0;
}
actionValues[maxIndex] = 1.0;
}
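// CEpsilonGreedyDistribution implements epsilon-greedy exploration: every
// action receives the base probability epsilon / numValues, and the greedy
// (highest-valued) action additionally receives 1 - epsilon, so the
// probabilities still sum to one.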
CEpsilonGreedyDistribution::CEpsilonGreedyDistribution(rlt_real epsilon)
{
addParameter("EpsilonGreedy", epsilon);
}
void CEpsilonGreedyDistribution::getDistribution(CStateCollection *state, CActionSet *availableActions, rlt_real *actionValues)
{
unsigned int numValues = availableActions->size();
rlt_real epsilon = getParameter("EpsilonGreedy");
rlt_real prop = epsilon / numValues;
rlt_real max = actionValues[0];
int maxIndex = 0;
for (unsigned int i = 0; i < numValues; i++)
{
if (actionValues[i] > max)
{
max = actionValues[i];
maxIndex = i;
}
actionValues[i] = prop;
}
actionValues[maxIndex] += 1 - epsilon;
}
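// CStochasticPolicy draws its actions from a probability distribution
// computed by the supplied CActionDistribution over the action values
// returned by getActionValues. getActionValues is not implemented in this
// file; it is presumably provided by concrete subclasses. The distribution's
// parameters are registered as parameters of the policy itself.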
CStochasticPolicy::CStochasticPolicy(CActionSet *actions, CActionDistribution *distribution) : CAgentStatisticController(actions)
{
actionValues = new rlt_real[actions->size()];
this->distribution = distribution;
addParameters(distribution);
gradientFactors = new CMyVector(actions->size());
actionGradientFeatures = new CFeatureList();
}
CStochasticPolicy::~CStochasticPolicy()
{
delete [] actionValues;
delete gradientFactors;
delete actionGradientFeatures;
}
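// Computes the action probabilities in two steps: first the raw action
// values are fetched, then the distribution object transforms them in place
// into probabilities.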
void CStochasticPolicy::getActionProbabilities(CStateCollection *state, CActionSet *availableActions, rlt_real *actionValues, CActionDataSet *actionDataSet)
{
getActionValues(state, availableActions, actionValues, actionDataSet);
distribution->getDistribution(state, availableActions, actionValues);
}
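// Samples an action by inverse-CDF (roulette-wheel) selection: a uniform
// random number z in [0, 1] is drawn and the action probabilities are
// accumulated until the running sum exceeds z. The loop bound guarantees a
// valid action is returned even if rounding errors keep the sum below z.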
CAction *CStochasticPolicy::getNextAction(CStateCollection *state, CActionDataSet *dataSet, CActionStatistics *stat)
{
CActionSet *availableActions = new CActionSet();
getActions()->getAvailableActions(availableActions, state);
assert(availableActions->size() > 0);
getActionProbabilities(state, availableActions, actionValues, dataSet);
rlt_real sum = actionValues[0];
CActionSet::iterator it = availableActions->begin();
rlt_real z = (rlt_real) rand() / (RAND_MAX);
unsigned int i = 0;
while (sum <= z && i < availableActions->size() - 1)
{
i++;
it++;
sum += actionValues[i];
}
if (stat != NULL)
{
stat->owner = this;
getActionStatistics(state, (*it), stat);
}
DebugPrint('p', "ActionPropabilities: ");
for (unsigned int j = 0; j < availableActions->size(); j++)
{
DebugPrint('p', "%f ", actionValues[j]);
}
DebugPrint('p', "\nChoosed Action: %d\n", actions->getIndex(*it));
CAction *action = *it;
delete availableActions;
return action;
}
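// Gradient of the action probability with respect to the policy parameters,
// assembled via the chain rule: grad p(a_k) = sum_i dp(a_k)/dQ_i * grad Q_i.
// The distribution supplies the per-action factors dp(a_k)/dQ_i and
// getActionGradient supplies the (sparse) feature gradient of each action
// value.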
void CStochasticPolicy::getActionProbabilityGradient(CStateCollection *state, CAction *action, CActionData *data, CFeatureList *gradientState)
{
gradientState->clear();
if (isDifferentiable())
{
getActionValues(state, actions, this->actionValues);
distribution->getGradientFactors(state, action, actions, actionValues, gradientFactors);
CActionSet::iterator it = actions->begin();
for (int j = 0;it != actions->end(); it ++, j++)
{
actionGradientFeatures->clear();
getActionGradient(state, *it, NULL, actionGradientFeatures);
CFeatureList::iterator itFeat = actionGradientFeatures->begin();
for (; itFeat != actionGradientFeatures->end(); itFeat++)
{
gradientState->update((*itFeat)->featureIndex, (*itFeat)->factor * gradientFactors->getElement(j));
}
}
}
if (DebugIsEnabled('p'))
{
DebugPrint('p', "Policy Gradient Factors:\n");
gradientState->saveASCII(DebugGetFileHandle('p'));
DebugPrint('p', "\n");
}
}
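// Usage sketch (illustrative only; everything outside this file is an
// assumption about the surrounding toolbox API):
//
//   CActionSet *actions = ...;            // actions of the learning problem
//   CAbstractQFunction *qFunction = ...;  // some concrete Q-function
//
//   // Deterministic greedy policy over the Q-function:
//   CQGreedyPolicy *greedy = new CQGreedyPolicy(actions, qFunction);
//
//   // Soft-max exploration with temperature beta = 10. Since
//   // CStochasticPolicy::getActionValues is not defined in this file, a
//   // concrete subclass that reads its values from a Q-function would be
//   // paired with the distribution:
//   CSoftMaxDistribution *softmax = new CSoftMaxDistribution(10.0);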