tilingqi.m
来自「approximate reinforcement learning」· M 代码 · 共 369 行
M
369 行
function out = tilingqi(cfg)
% Approximate offline, model-based Q-iteration with tile-coding approximation
% OUT = TILINGQI(CFG)
% Inputs:
% CFG - structure with fields as commented in the code below
% can also be given as a string, see help str2cfg
% Outputs:
% OUT - either the Q-iteration statistics if the algorithm was run or resumed,
% or the control history if only a replay was performed
%
if nargin < 1, cfg = struct(); end;
% -----------------------------------------------
% Process configuration
% default config
% script config
CFG.run = 0; % run learning
CFG.resume = 0; % resume learning
CFG.replay = 0; % replay learned policy
CFG.problem = ''; % what problem to solve
CFG.loadapprox = ''; % load approximator data from file
CFG.datafile = 'tqidata'; % save data to file
% learning config
CFG.gamma = 0.98; % discount factor
CFG.eps = .1; % threshold for convergence
CFG.maxiter = 300; % max number of iterations for phi-iteration
CFG.serial = 1; % serial or parallel Q-iteration
CFG.storeact = -1; % whether to store activation (membership) vectors (-1 for auto)
% replay config
CFG.interph = 1; % interpolated (averaged) policy
CFG.x0 = []; % initial state for replay (empty for default)
CFG.tend = 30; % end time for replay
% stats & figure output config
CFG.plottarget = 'screen'; % 'screen', '', 'latex', or 'beamer'
CFG.savetheta = 0; % save param history in stats NOT IMPLEMENTED
CFG.savedir = '';
CFG.savefig = '';
% display config
CFG.verb = 5; % verbosity: the higher, the more detailed the messages displayed
CFG.noplot = 0; % whether to suppress figure plots
CFG.silent = 0; % suppress all output
CFG.initstepdisp = .1; % feedback every 10% of MDP init
CFG.iterdisp = 10; % feedback after every 10 iteration
CFG.itersave = 25; % save after every 25 iterations
% If caller provided string, parse it into a structure
if ischar(cfg),
cfg = str2cfg(cfg, fieldnames(CFG));
end;
% Try installing problem defaults whenever caller does not specify values
% Problem must be specified for this to work
try
if ~isempty(cfg.problem), cfg = checkparams(cfg, feval(cfg.problem, 'tiling')); end;
catch % do nothing
end;
% Install function defaults for everything else
cfg = checkparams(cfg, CFG);
% Process configuration dependencies
if cfg.silent, cfg.verb = -Inf; cfg.noplot = 1; end;
% Echo config
dispx('Tile-coding Q-iteration called with the following configuration:', cfg.verb, 1);
dispx(cfg, cfg.verb, 1);
cfg.noinit = (cfg.resume || cfg.replay) && exist([cfg.datafile '.mat'], 'file');
if cfg.noinit, % load data file, making sure that cfg is not overwritten
cfg1 = cfg;
load(cfg.datafile);
cfg = cfg1; clear cfg1;
end;
% -----------------------------------------------
% Setup model
if ~cfg.noinit,
model = feval(cfg.problem, 'model');
end;
% load tiling, trans, etc. data if given a tiling data file
if ~isempty(cfg.loadapprox),
load(cfg.loadapprox, 'X', 'X0', 'U', 'DIMS', 'XDATA', 'TDIM', 'MDP', 'P', 'tcfg');
dispx(['Initialization (tiling, transition, and projection data) loaded from [' cfg.loadapprox '].'], cfg.verb, 1);
end;
% -----------------------------------------------
% Setup tiling and command discretization, if not loaded
if ~cfg.noinit && isempty(cfg.loadapprox),
checkparams(cfg, [], {'xgrids', 'ugrids', 'tcfg'});
X = cfg.xgrids; U = cfg.ugrids; tcfg = cfg.tcfg;
DIMS.p = length(X); DIMS.q = length(U); % # of states and outputs
DIMS.dimx = []; DIMS.dimu = []; % dimensions of grids
for p = 1:DIMS.p, DIMS.dimx(end+1) = length(X{p}); end;
for q = 1:DIMS.q, DIMS.dimu(end+1) = length(U{q}); end;
% create tiling
[XDATA, theta, TDIM] = tiling_create(X, DIMS.dimu, tcfg.c, tcfg);
clear tdata;
% # of parameters
DIMS.N = TDIM.c * prod(DIMS.dimx); % number of tilings x number of grid hypercubes
DIMS.M = prod(DIMS.dimu);
% choose sample points as center points of all hypercubes formed by intersections
% of all tiling surfaces
X0 = cell(DIMS.p, 1);
DIMS.dimx0 = [];
for p = 1:DIMS.p,
xp = sort(XDATA{p}(:));
xp = xp(~isnan(xp)); xp = [xp(diff(xp) ~= 0); xp(end)]; % remove duplicates and NaNs
X0{p} = xp(1:end-1) + diff(xp)/2; % pick center of every interval
DIMS.dimx0(end+1) = length(X0{p});
end;
DIMS.N0 = prod(DIMS.dimx0); % total number of state samples
end;
% -----------------------------------------------
% Q-iteration
if cfg.run || cfg.resume,
% -----------------------------------------------
% MDP transition data, if not resuming and not given a transition data file
if ~cfg.resume && isempty(cfg.loadapprox),
if cfg.storeact < 0, % auto
% decide heuristically whether to store activations of next states:
% if number of elements to store < 5000, or the number of elements is
% at most 5 times greater a plain transition table would require
cfg.storeact = ((DIMS.N0 * DIMS.M * TDIM.c)) < 5000 || (TDIM.c <= 5*DIMS.p);
end;
if cfg.storeact, dispx('Computing MDP, sample, and next state activation data...', cfg.verb, 0);
else dispx('Computing MDP and sample activation data...', cfg.verb, 0);
end;
t = cputime;
if cfg.storeact, MDP.F = zeros(DIMS.N0, DIMS.M, TDIM.c);
else MDP.F = zeros(DIMS.N0, DIMS.M, DIMS.p);
end;
% init MDP structure and activation matrix
MDP.R = zeros(DIMS.N0, DIMS.M);
PHI = sparse(DIMS.N, DIMS.N0);
% iterate over (xi, uj) \in (X0, U0) and populate MDP structures
x = zeros(DIMS.p, 1); u = zeros(DIMS.q, 1);
prog = cfg.initstepdisp;
for i = 1:DIMS.N0,
% compute linear indices and construct state
ii = lin2ndi(i, DIMS.dimx0);
for p = 1:DIMS.p, x(p) = X0{p}(ii(p)); end;
% record activated tiles in activation matrix (only retrieve for first discrete
% action as we don't care about the discrete components)
[tv ti] = tiling_get(XDATA, theta, TDIM, [x; 1]);
PHI(ti, i) = 1/TDIM.c;
% process next state for each action
for j = 1:DIMS.M,
% compute linear indices and construct input
jj = lin2ndi(j, DIMS.dimu);
for q = 1:DIMS.q, u(q) = U{q}(jj(q)); end;
[xplus, rplus] = feval(model.fun, model, x, u);
% store either activated tiles for xplus, or xplus itself
if cfg.storeact,
[tv MDP.F(i, j, :)] = tiling_get(XDATA, theta, TDIM, [xplus; 1]);
else
MDP.F(i, j, :) = xplus;
end;
MDP.R(i, j) = rplus;
end;
if i/DIMS.N0 > prog, % progress feedback
dispx([num2str(prog * 100) '% completed...'], cfg.verb, 2);
prog = prog + cfg.initstepdisp;
end;
end; % FOR over state samples
dispx('done.', cfg.verb, 2);
clear tv; % unneeded variable
% compute projection matrix from activation matrix PHI
dispx('Computing projection matrix', cfg.verb, 1);
nrm = sum(PHI, 2); P = sparse(DIMS.N, DIMS.N0);
for i = 1:DIMS.N, P(i, :) = PHI(i, :) / nrm(i); end;
% activation matrix no longer needed
clear PHI;
% record how much time MDP data computation took (disregarding that progress display is
% also counted here)
qistats.tinit = cputime - t;
save(cfg.datafile);
dispx(['Initialization data saved to [' cfg.datafile ']'], cfg.verb, 1);
end; % IF initialize MDP etc.
% -----------------------------------------------
% Do Q-iteration
dispx('Performing tiling Q-value iteration...', cfg.verb, 0);
% Q-iteration parameters: cfg.eps, cfg.maxiter, cfg.gamma
% init param vector and iteration index when not resuming
if ~cfg.resume,
qistats.delta = [];
qistats.t = 0;
dispx('Initializing approximator parameters', cfg.verb, 2);
theta = tiling_reset(TDIM, tcfg);
k = 1;
end;
delta = inf;
t = cputime;
TF = zeros(DIMS.N0, DIMS.M); % records the rhs of Bellman operator for all samples
while k <= cfg.maxiter && delta > cfg.eps, % main loop
% compute the rhs of approximate Bellman operator,
% [T(F(theta))](xi, uj) for all samples xi, uj
if cfg.storeact, % use the stored indices of the activated tiles
for i = 1:DIMS.N0,
for j = 1:DIMS.M,
TF(i, j) = MDP.R(i, j) + cfg.gamma * max( mean(theta(MDP.F(i, j, :), :)) );
end;
end;
else
for i = 1:DIMS.N0,
for j = 1:DIMS.M,
TF(i, j) = MDP.R(i, j) + cfg.gamma * ...
max( tiling_get(XDATA, theta, TDIM, MDP.F(i, j, :)) );
end;
end;
end;
% update weight data
thetaold = theta;
theta = P * TF; % project TF into theta using P
delta = max(max(abs(theta - thetaold)));
% update stats
qistats.t = qistats.t + (cputime - t);
qistats.delta(end + 1) = delta;
% visual feedback of algorithm progress
if ~mod(k, cfg.iterdisp),
dispx(['k=' num2str(k) ' iteration completed, delta=' num2str(delta)], cfg.verb, 2);
end;
% data backup
if ~mod(k, cfg.itersave),
save(cfg.datafile);
dispx(['Intermediary data at k=' num2str(k) ' saved to [' cfg.datafile '].'], cfg.verb, 1);
end;
t = cputime;
k = k + 1;
end; % while not converged
% remove unneeded large arrays
clear TF;
if delta < cfg.eps, dispx('Convergence detected. Algorithm stopped.', cfg.verb, 0);
else dispx(['maxiter=' num2str(cfg.maxiter) ' exhausted. Algorithm stopped'], cfg.verb, 0);
end;
end;
% -----------------------------------------------
% Replay
if cfg.replay,
% compute locally optimal policy (local optimal discrete action for each basis
% function/tile)
if cfg.interph
[Qstar ui] = max(theta, [], 2); clear Qstar;
ui = lin2ndi(ui, DIMS.dimu);
hstar = zeros(DIMS.N, DIMS.q);
for q = 1:DIMS.q,
hstar(:, q) = U{q}(ui(:, q));
end;
end;
% initial state
if ~isempty(cfg.x0), % specified initial state
x0 = cfg.x0(:);
else % zeros
x0 = zeros(p, 1);
end;
dispx(['Controlling from x0=' num2str(reshape(x0, 1, [])) ], cfg.verb, 1);
% history
t = 0 : model.Ts : cfg.tend;
x = zeros(DIMS.p, length(t)); x(:, 1) = x0;
u = zeros(DIMS.q, length(t)); u(:, end) = NaN;
r = zeros(1, length(t)); r(1) = NaN;
% steps loop
for k = 1:length(t)-1,
% compute optimal action
[Qstar istar] = tiling_get(XDATA, theta, TDIM, x(:, k));
if cfg.interph, % average between local maxima corresponding to each tile
u(:, k) = mean(hstar(istar, :));
else % crisp selection using the approximate Q-values of all u in U
ui = find(Qstar == max(Qstar)); ui = ui(ceil(rand * length(ui)));
ui = lin2ndi(ui, DIMS.dimu);
for q = 1:DIMS.q,
u(q, k) = U{q}(ui(:, q));
end;
end;
% apply to system
[x(:, k+1) r(k+1)] = feval(model.fun, model, x(:, k), u(:, k));
end;
% plot history & optionally save figures
hist.t = t; hist.x = x; hist.u = u; hist.r = r;
if ~cfg.noplot,
toscreen = strcmp(cfg.plottarget, 'screen') || isempty(cfg.plottarget);
figh = plothistory(hist);
saveplot(figh, [cfg.savedir cfg.savefig], cfg.plottarget);
% if the model supplies a plot function, use it to plot the trajectory
if isfield(model, 'plotfun'),
figh(end+1) = feval(model.plotfun, hist);
end;
% pause;
% close the figures if their target was not the screen
if ~toscreen, close(figh); end;
end;
end; % IF replay
% -----------------------------------------------
% Backup data
if cfg.run || cfg.resume,
% save into the same directory as the problem
savedir = fileparts(which(cfg.problem));
% if default file name is used, generate a unique file name
if strcmp(cfg.datafile, CFG.datafile),
c = fix(clock);
for i = 1:length(c),
cfg.datafile = [cfg.datafile '-' num2str(c(i), '%02d')];
end;
delete([CFG.datafile '.mat']);
elseif ~strcmp(savedir, pwd)
delete([cfg.datafile '.mat']); % need to save in a different directory, so delete anyway
end; % otherwise just overwrite
cfg.datafile = [savedir '\' cfg.datafile];
save(cfg.datafile);
dispx(['Tile-coding Q-iteration finished. Data was saved to [' cfg.datafile '].'], cfg.verb, 1);
end;
% -----------------------------------------------
% set output
if cfg.run || cfg.resume,
out = qistats;
elseif cfg.replay % output history
out = hist;
end;
% END tilingqi() RETURNING out =================================================================
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?