📄 doubleint_tilingqit.m
字号:
function out = doubleint_tilingqit(cfg)
% Custom tiling Q-iteration for the double integrator
% Also includes attempts at tiling Q-i versions different from averaging projection,
% e.g. gradient-like, least-squares
% Inputs:
% CFG - structure with fields commented in the code below
% can also be given as a string, e.g.
% 'resume model=hier transdata=data1 datafile=data2'
% Outputs:
% OUT - either the Q-iteration statistics if it was run or resumed,
% or the trajectory history if only a replayed was performed
%
if nargin < 1, cfg = struct(); end;
% Default config
CFG.run = 1; % run learning
CFG.resume = 0; % resume learning
CFG.replay = 0; % replay learned policy
CFG.loadtiling = ''; % load tiling data from file
CFG.datafile = 'tqidata'; % save data to file
CFG.interph = 0; % interpolated (averaged) policy
% 'avg' for averaging
% 'sum' for sum of tile values instead of average
% 'grad' for gradient-like projection
% 'lsqr' for \ (mldivide) least-squares
% 'pinv' for pinv least-squares
% 'inv' for pinv-least squares with N = N0
CFG.method = 'avg';
CFG.maxiter = 500; % max number of iterations for Q-iteration
CFG.epsilon = .01; % threshold for convergence
CFG.x0 = []; % initial state for replay (default random)
CFG.tend = 30; % end time for replay
CFG.plottarget = 'screen'; % 'screen', '', 'latex', or 'beamer'
CFG.savetheta = 0; % save param history in stats NOT IMPLEMENTED
CFG.savedir = '';
CFG.savefig = '';
% Parse / set config
if ischar(cfg),
cfg = str2cfg(cfg, fieldnames(CFG));
end;
% install pb defasults
cfg = checkparams(cfg, feval('doubleint_problem', 'tiling'));
% install defaults where not specified
cfg = checkparams(cfg, CFG);
% echo
cfg
cfg.loaddata = (cfg.resume || cfg.replay) && exist([cfg.datafile '.mat'], 'file');
if cfg.loaddata, % load data file, making sure that cfg is not overwritten
cfg1 = cfg;
load(cfg.datafile);
cfg = cfg1; clear cfg1;
end;
% -------------------------
% Setup model, if not loaded one
if ~cfg.loaddata,
model = feval('doubleint_problem', 'model');
end;
% load tiling, trans, etc. data if given a tiling data file
if ~isempty(cfg.loadtiling),
load(cfg.loadtiling, 'xdata', 'theta', 'tdim', 'MDP', 'PHI', 'c', 'tcfg', 'grids', 'U0');
disp(['Tiling, transition, and activation data loaded from [' cfg.loadtiling '].']);
end;
% -------------------------
% Setup tiling and command discretization, if not loaded
if ~cfg.loaddata,
if isempty(cfg.loadtiling),
checkparams(cfg, [], {'xgrids', 'ugrids', 'tcfg'});
grids = cfg.xgrids; U0 = cfg.ugrids{1};
tcfg = cfg.tcfg; c = tcfg.c;
[xdata, theta, tdim] = tiling_create(grids, length(U0), c, tcfg);
else % reset params matrix
theta = tiling_reset(tdim, tcfg);
end;
% these get recomputed, they aren't loaded above
N = prod(tdim.tsize); % # of approximator params for one discrete component
% choose sample points - middle of each patch
xp = sort(xdata{1}(:)); xdotp = sort(xdata{2}(:));
% remove duplicates and NaNs
xp = xp(~isnan(xp)); xp = [xp(diff(xp) ~= 0); xp(end)];
xdotp = xdotp(~isnan(xdotp)); xdotp = [xdotp(diff(xdotp) ~= 0); xdotp(end)];
% pick a point in every patch (the center)
X0 = xp(1:end-1) + diff(xp)/2;
XDOT0 = xdotp(1:end-1) + diff(xdotp)/2;
X0DIM = [length(X0) length(XDOT0)];
N0 = prod(X0DIM); % number of state samples
M0 = length(U0); % number of command samples
M = M0;
DIM0 = [N0 M0];
end;
% *************************************************************************************
% Q-iteration
if cfg.run || cfg.resume,
% Q-iteration parameters on the global config
lp.maxiter = cfg.maxiter;
lp.eps = cfg.epsilon;
% if resuming, do not reinit learning
if ~cfg.resume,
% Q-iteration parameters
lp.gamma = .98;
qistats.delta = [];
qistats.t = 0;
% if cfg.saveq, qistats.Qhistory = cell(lp.maxiter+1, 1); end; % also leave room for initial Q
% if cfg.saveq, qistats.Qhistory{1} = theta; end;
% if given a transition data file, load transition data from there
if isempty(cfg.loadtiling),
disp('Creating MDP transition data and activation matrix...');
% init MDP structure
MDP.F = zeros([2, DIM0]); MDP.R = zeros(DIM0);
% init activation matrix
PHI = sparse(N, N0);
prog = .1; progstep = .1;
for i = 1:N0,
ii = lin2ndi(i, X0DIM); % indices in X0 and XDOT0
x = [X0(ii(1)); XDOT0(ii(2))];
for j = 1:M0,
% record MDP data -- next state and corresponding reward
[xplus, rplus] = doubleint_mdp(model, x, U0(j));
MDP.F(:, i, j) = xplus; MDP.R(i, j) = rplus;
end;
% record activated tiles (only retrieve for first discrete
% action as we don't care about the discrete components)
[tv ti] = tiling_get(xdata, theta, tdim, [x; 1]);
PHI(ti, i) = 1/c;
if i/N0 > prog, % progress feedback
disp([num2str(prog * 100) '% completed...']);
prog = prog + progstep;
end;
end;
disp('done.');
save(cfg.datafile);
disp(['Transition & activation data saved to [' cfg.datafile ']']);
end; % "load model data from" IF
end;
% compute projection matrix from activation matrix PHI
disp('Computing projection matrix');
switch cfg.method
% should be "averaging" on rows (rows normalized) if used in averaging QI
case {'avg', 'sum'}
nrm = sum(PHI, 2);
for i = 1:N, P(i, :) = PHI(i, :) / nrm(i); end;
% do nothing for gradient-like update, coefficients stay 1/c
case 'grad'
P = PHI;
% transpose for ease of applying \ (mldivide)
case 'lsqr'
P = PHI';
warning('off', 'MATLAB:rankDeficientMatrix');
% compute pseudoinverse for lsqr-pinv method
case 'pinv'
P = pinv(full(PHI)');
case 'inv'
% compute a full-row-rank matrix by removing linearly-dependent rows
% from activation matrix
% note the brute-force method below is very inefficient...
PHI = full(PHI);
r = rank(PHI);
bi = ones(N0, 1);
k = 1;
while (sum(bi) > r) && (k <= N0),
bi(k) = 0;
% check if element is needed for full-rank matrix
if rank(PHI(:, find(bi))) < r, bi(k) = 1;
% else leave it at 0
% disp([num2str(sum(bi)-r) ' samples to remove until full-rank']);
end;
k = k + 1;
end;
% select an additional set of sample points to get to N points
bi(find(~bi, N - r)) = 1;
XI0 = find(bi);
P = pinv(PHI(:, XI0)');
end;
% do QI
disp('Performing tiling Q-value iteration...');
% k unchanged when resuming, such that we never exceed maxiter
if ~cfg.resume, k = 1; end;
delta = inf;
t = cputime;
TF = zeros(N0, M0); % records the rhs of Bellman operator for all samples
while k <= lp.maxiter && delta > lp.eps, % main loop
% compute the rhs of approximate Bellman operator,
% [T(F(theta))](x, u) for all samples x, u
switch cfg.method,
case {'avg', 'lsqr', 'pinv', 'grad'} % F = 1/c sum tile_values
for i = 1:N0,
for j = 1:M0,
TF(i, j) = MDP.R(i, j) + lp.gamma * ...
max( tiling_get(xdata, theta, tdim, MDP.F(:, i, j)) );
end;
end;
case 'sum' % F = sum tile_values
for i = 1:N0,
for j = 1:M0,
TF(i, j) = MDP.R(i, j) + lp.gamma * ...
max( c*tiling_get(xdata, theta, tdim, MDP.F(:, i, j)) );
end;
end;
case 'inv' % only for subset XI0 of samples (N in number)
for i = 1:XI0,
for j = 1:M0,
TF(i, j) = MDP.R(i, j) + lp.gamma * ...
max( tiling_get(xdata, theta, tdim, MDP.F(:, i, j)) );
end;
end;
end;
% update weight data
thetaold = theta;
switch cfg.method,
case {'avg','pinv','sum','grad'}
theta = P * TF;
case 'lsqr'
% solution via left-inverse
theta = P \ TF;
% solution via lsqr (needs to be applied discrete element-wise)
% for j = 1:M0,
% theta(:, j) = lsqr(P, TF(:, j));
% end;
case 'inv'
theta = P * TF(XI0, :);
end;
% compute infinity-norm of function (not matrix)
delta = max(max(abs(theta - thetaold)));
% update stats
qistats.t = qistats.t + (cputime - t);
qistats.delta(end + 1) = delta;
% if cfg.saveq, qistats.Qhistory{k+1} = theta; end;
% visual feedback of algorithm progress
if ~mod(k, 10),
disp(['k=' num2str(k) ' iteration completed, delta=' num2str(delta)]);
end;
% data backup
if ~mod(k, 40),
save(cfg.datafile);
disp(['Intermediary data saved to [' cfg.datafile '].']);
end;
t = cputime;
k = k + 1;
end; % while not converged
save(cfg.datafile); disp(['Intermediary data saved to [' cfg.datafile '].']);
end;
% *************************************************************************************
% *************************************************************************************
% ----------------------------
% Replay
if cfg.replay,
% compute locally optimal policy (local optimal discrete action for each basis
% function/tile)
if cfg.interph
[Qstar hstar] = max(theta, [], 2); clear Qstar;
hstar = U0(hstar);
end;
% config
lp.Tfin = cfg.tend;
lp.maxsteps = round(lp.Tfin / model.Ts);
% initial state
if ~isempty(cfg.x0), % force reset state
x0 = cfg.x0(:);
else % reset to random
x0 = (2*rand(2, 1)) .* phys.maxx -phys.maxx;
end;
% history
t = 0 : model.Ts : lp.Tfin;
x = zeros(2, length(t)); x(:, 1) = x0;
u = zeros(1, length(t)); u(end) = NaN;
r = zeros(1, length(t)); r(1) = NaN;
% steps loop
for k = 1:lp.maxsteps,
if cfg.interph, % average between local maxima corresponding to each tile
[Qstar istar] = tiling_get(xdata, theta, tdim, x(:, k));
u(k) = mean(hstar(istar));
else
[Qstar jstar] = max( tiling_get(xdata, theta, tdim, x(:, k)) );
u(k) = U0(jstar);
end;
% apply to system
[x(:, k+1), r(k+1)] = doubleint_mdp(model, x(:, k), u(k));
end;
% plot history
h.t = t; h.x = x; h.u = u; h.r = r;
toscreen = strcmp(cfg.plottarget, 'screen') || isempty(cfg.plottarget);
figh = plothistory(h);
saveplot(figh, [cfg.savedir cfg.savefig], cfg.plottarget);
% if the model supplies a plot function, use it to plot the trajectory
if isfield(model, 'plotfun'),
figh(end+1) = feval(model.plotfun, hist);
end;
% close the figures if their target was not the screen
if ~toscreen, close(figh); end;
end; % end REPLAY if
if cfg.run || cfg.resume,
% save the data from this run
% if default file name is used, generate a unique file name
if strcmp(cfg.datafile, 'tqitdata'),
c = fix(clock);
for i = 1:length(c),
cfg.datafile = [cfg.datafile '_' num2str(c(i), '%02d')];
end;
delete('tqitdata.mat');
end;
save(cfg.datafile);
disp(['Tiling Q-iteration finished. Data was saved to [' cfg.datafile '].']);
end;
% set output
if cfg.run || cfg.resume,
out = qistats;
elseif cfg.replay
out = h;
end;
% END centrfzrl() ================================================================================
% Code to perform PF in qiteration - to verify whether for tiling theta = PF theta for any
% theta (which is by the way not true, as code below showed)
% ii = lin2ndi(i, X0DIM);
% TF(i, j) = tiling_get(xdata, theta, tdim, [X0(ii(1)); XDOT0(ii(2)); j]);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -