📄 doubleint_tilingqit.m

📁 approximate reinforcement learning
💻 M
字号:
function out = doubleint_tilingqit(cfg)
% Custom tiling Q-iteration for the double integrator
% Also includes attempts at tiling Q-i versions different from averaging projection, 
% e.g. gradient-like, least-squares
% Inputs:
%   CFG     - structure with fields commented in the code below
%           can also be given as a string, e.g.
%           'resume model=hier transdata=data1 datafile=data2'
% Outputs:
%   OUT     - either the Q-iteration statistics if it was run or resumed,
%           or the trajectory history if only a replayed was performed
% 
if nargin < 1, cfg = struct(); end;

% Default config
CFG.run = 1;                        % run learning
CFG.resume = 0;                     % resume learning
CFG.replay = 0;                     % replay learned policy
CFG.loadtiling = '';                % load tiling data from file
CFG.datafile = 'tqidata';           % save data to file
CFG.interph = 0;                    % interpolated (averaged) policy
% 'avg' for averaging
% 'sum' for sum of tile values instead of average
% 'grad' for gradient-like projection
% 'lsqr' for \ (mldivide) least-squares
% 'pinv' for pinv least-squares
% 'inv' for pinv-least squares with N = N0
CFG.method = 'avg';                 

CFG.maxiter = 500;                  % max number of iterations for Q-iteration
CFG.epsilon = .01;                   % threshold for convergence
CFG.x0 = [];                        % initial state for replay (default random)
CFG.tend = 30;                       % end time for replay

CFG.plottarget = 'screen';          % 'screen', '', 'latex', or 'beamer'
CFG.savetheta = 0;                  % save param history in stats NOT IMPLEMENTED
CFG.savedir = '';
CFG.savefig = '';

% Parse / set config
if ischar(cfg),
    cfg = str2cfg(cfg, fieldnames(CFG));
end;
% install pb defasults
cfg = checkparams(cfg, feval('doubleint_problem', 'tiling'));
% install defaults where not specified
cfg = checkparams(cfg, CFG);
% echo
cfg

cfg.loaddata = (cfg.resume || cfg.replay) && exist([cfg.datafile '.mat'], 'file');

if cfg.loaddata,        % load data file, making sure that cfg is not overwritten
    cfg1 = cfg;
    load(cfg.datafile);
    cfg = cfg1; clear cfg1;
end;

% -------------------------
% Setup model, if not loaded one
if ~cfg.loaddata,
    model = feval('doubleint_problem', 'model');
end;

% load tiling, trans, etc. data if given a tiling data file
if ~isempty(cfg.loadtiling),
    load(cfg.loadtiling, 'xdata', 'theta', 'tdim', 'MDP', 'PHI', 'c', 'tcfg', 'grids', 'U0');
    disp(['Tiling, transition, and activation data loaded from [' cfg.loadtiling '].']);
end;


% -------------------------
% Setup tiling and command discretization, if not loaded
if ~cfg.loaddata,
    if isempty(cfg.loadtiling),
        checkparams(cfg, [], {'xgrids', 'ugrids', 'tcfg'});
        grids = cfg.xgrids; U0 = cfg.ugrids{1}; 
        tcfg = cfg.tcfg; c = tcfg.c;
        [xdata, theta, tdim] = tiling_create(grids, length(U0), c, tcfg);
    else        % reset params matrix
        theta = tiling_reset(tdim, tcfg);
    end;
    
    % these get recomputed, they aren't loaded above
    N = prod(tdim.tsize);      % # of approximator params for one discrete component

    % choose sample points - middle of each patch
    xp = sort(xdata{1}(:)); xdotp = sort(xdata{2}(:));
    % remove duplicates and NaNs
    xp = xp(~isnan(xp)); xp = [xp(diff(xp) ~= 0); xp(end)];
    xdotp = xdotp(~isnan(xdotp)); xdotp = [xdotp(diff(xdotp) ~= 0); xdotp(end)];
    
    % pick a point in every patch (the center)
    X0 = xp(1:end-1) + diff(xp)/2;
    XDOT0 = xdotp(1:end-1) + diff(xdotp)/2;
    X0DIM = [length(X0) length(XDOT0)];
    N0 = prod(X0DIM);     % number of state samples
    
    M0 = length(U0);     % number of command samples
    M = M0;
    DIM0 = [N0 M0];
end;


% *************************************************************************************
% Q-iteration
if cfg.run || cfg.resume,
    
    % Q-iteration parameters on the global config
    lp.maxiter = cfg.maxiter;
    lp.eps = cfg.epsilon;

    % if resuming, do not reinit learning
    if ~cfg.resume,

        % Q-iteration parameters
        lp.gamma = .98;
        qistats.delta = [];
        qistats.t = 0;
%         if cfg.saveq, qistats.Qhistory = cell(lp.maxiter+1, 1); end;   % also leave room for initial Q
%         if cfg.saveq, qistats.Qhistory{1} = theta; end;

        % if given a transition data file, load transition data from there
        if isempty(cfg.loadtiling),
            disp('Creating MDP transition data and activation matrix...');
            
            % init MDP structure
            MDP.F = zeros([2, DIM0]); MDP.R = zeros(DIM0);
            % init activation matrix
            PHI = sparse(N, N0);

            prog = .1; progstep = .1;
            for i = 1:N0,
                ii = lin2ndi(i, X0DIM);        % indices in X0 and XDOT0
                x = [X0(ii(1)); XDOT0(ii(2))];
                for j = 1:M0,
                    % record MDP data -- next state and corresponding reward
                    [xplus, rplus] = doubleint_mdp(model, x, U0(j));
                    MDP.F(:, i, j) = xplus; MDP.R(i, j) = rplus;
                end;
                
                % record activated tiles (only retrieve for first discrete
                % action as we don't care about the discrete components)
                [tv ti] = tiling_get(xdata, theta, tdim, [x; 1]);
                PHI(ti, i) = 1/c;
                
                if i/N0 > prog,     % progress feedback
                    disp([num2str(prog * 100) '% completed...']);
                    prog = prog + progstep;
                end;
            end;

            disp('done.');
            save(cfg.datafile);
            disp(['Transition & activation data saved to [' cfg.datafile ']']);
        end;        % "load model data from" IF
    end;

    % compute projection matrix from activation matrix PHI
    disp('Computing projection matrix');
    switch cfg.method
        % should be "averaging" on rows (rows normalized) if used in averaging QI
        case {'avg', 'sum'}
            nrm = sum(PHI, 2);
            for i = 1:N, P(i, :) = PHI(i, :) / nrm(i); end;
        % do nothing for gradient-like update, coefficients stay 1/c
        case 'grad'
            P = PHI;
        % transpose for ease of applying \ (mldivide)
        case 'lsqr'
            P = PHI';
            warning('off', 'MATLAB:rankDeficientMatrix');
        % compute pseudoinverse for lsqr-pinv method
        case 'pinv'
            P = pinv(full(PHI)');
        case 'inv'            
            % compute a full-row-rank matrix by removing linearly-dependent rows 
            % from activation matrix
            % note the brute-force method below is very inefficient...
            PHI = full(PHI);
            r = rank(PHI);
            bi = ones(N0, 1);
            k = 1;
            while (sum(bi) > r) && (k <= N0),
                bi(k) = 0;
                % check if element is needed for full-rank matrix
                if rank(PHI(:, find(bi))) < r, bi(k) = 1;
                    % else leave it at 0
%                     disp([num2str(sum(bi)-r) ' samples to remove until full-rank']);
                end;
                k = k + 1;
            end;
            
            % select an additional set of sample points to get to N points
            bi(find(~bi, N - r)) = 1;
            
            XI0 = find(bi);
            P = pinv(PHI(:, XI0)');
    end;

    
    % do QI
    disp('Performing tiling Q-value iteration...');
    % k unchanged when resuming, such that we never exceed maxiter
    if ~cfg.resume, k = 1; end;
    delta = inf;
    t = cputime;
    TF = zeros(N0, M0);   % records the rhs of Bellman operator for all samples
    while k <= lp.maxiter && delta > lp.eps,       % main loop
        
        % compute the rhs of approximate Bellman operator, 
        % [T(F(theta))](x, u) for all samples x, u
        switch cfg.method,
            case {'avg', 'lsqr', 'pinv', 'grad'}        % F = 1/c sum tile_values
                for i = 1:N0, 
                    for j = 1:M0,
                        TF(i, j) = MDP.R(i, j) + lp.gamma * ...
                            max( tiling_get(xdata, theta, tdim, MDP.F(:, i, j)) );
                    end; 
                end;
            case 'sum'                          % F = sum tile_values
                for i = 1:N0, 
                    for j = 1:M0,
                        TF(i, j) = MDP.R(i, j) + lp.gamma * ...
                            max( c*tiling_get(xdata, theta, tdim, MDP.F(:, i, j)) );
                    end; 
                end;
            case 'inv'                          % only for subset XI0 of samples (N in number)
                for i = 1:XI0, 
                    for j = 1:M0,
                        TF(i, j) = MDP.R(i, j) + lp.gamma * ...
                            max( tiling_get(xdata, theta, tdim, MDP.F(:, i, j)) );
                    end; 
                end;
        end;
                

        % update weight data
        thetaold = theta;
        switch cfg.method,
            case {'avg','pinv','sum','grad'}
                theta = P * TF;
            case 'lsqr'
                % solution via left-inverse
                theta = P \ TF;
                % solution via lsqr (needs to be applied discrete element-wise)
%                 for j = 1:M0,
%                     theta(:, j) = lsqr(P, TF(:, j));
%                 end;
            case 'inv'
                theta = P * TF(XI0, :);
        end;
        
        % compute infinity-norm of function (not matrix)
        delta = max(max(abs(theta - thetaold)));        

        % update stats
        qistats.t = qistats.t + (cputime - t);
        qistats.delta(end + 1) = delta;
%         if cfg.saveq, qistats.Qhistory{k+1} = theta; end;
        
        % visual feedback of algorithm progress
        if ~mod(k, 10), 
            disp(['k=' num2str(k) ' iteration completed, delta=' num2str(delta)]);
        end;
        % data backup
        if ~mod(k, 40),
            save(cfg.datafile);
            disp(['Intermediary data saved to [' cfg.datafile '].']);
        end;
        
        t = cputime;
        k = k + 1;
    end;        % while not converged

    save(cfg.datafile); disp(['Intermediary data saved to [' cfg.datafile '].']);
    
end;

% *************************************************************************************
% *************************************************************************************


% ----------------------------
% Replay
if cfg.replay,

    % compute locally optimal policy (local optimal discrete action for each basis
    % function/tile)
    if cfg.interph
        [Qstar hstar] = max(theta, [], 2); clear Qstar;
        hstar = U0(hstar);
    end;
        
    % config
    lp.Tfin = cfg.tend;
    lp.maxsteps = round(lp.Tfin / model.Ts);
    % initial state
    if ~isempty(cfg.x0),      % force reset state
        x0 = cfg.x0(:);
    else                      % reset to random
        x0 = (2*rand(2, 1)) .* phys.maxx -phys.maxx;
    end;

    % history
    t = 0 : model.Ts : lp.Tfin;
    x = zeros(2, length(t)); x(:, 1) = x0;
    u = zeros(1, length(t)); u(end) = NaN;
    r = zeros(1, length(t)); r(1) = NaN;

    % steps loop
    for k = 1:lp.maxsteps,
        if cfg.interph,     % average between local maxima corresponding to each tile
            [Qstar istar] = tiling_get(xdata, theta, tdim, x(:, k));
            u(k) = mean(hstar(istar));
        else
            [Qstar jstar] = max( tiling_get(xdata, theta, tdim, x(:, k)) );
            u(k) = U0(jstar);
        end;
        % apply to system
        [x(:, k+1), r(k+1)] = doubleint_mdp(model, x(:, k), u(k));
    end;

    % plot history
    h.t = t; h.x = x; h.u = u; h.r = r;
    toscreen = strcmp(cfg.plottarget, 'screen') || isempty(cfg.plottarget);
    figh = plothistory(h);
    saveplot(figh, [cfg.savedir cfg.savefig], cfg.plottarget);
    % if the model supplies a plot function, use it to plot the trajectory
    if isfield(model, 'plotfun'),
        figh(end+1) = feval(model.plotfun, hist);
    end;
    % close the figures if their target was not the screen
    if ~toscreen, close(figh); end;
        
end;        % end REPLAY if

if cfg.run || cfg.resume, 
    % save the data from this run
    % if default file name is used, generate a unique file name
    if strcmp(cfg.datafile, 'tqitdata'),
        c = fix(clock);
        for i = 1:length(c),
            cfg.datafile = [cfg.datafile '_' num2str(c(i), '%02d')];
        end;
        delete('tqitdata.mat');
    end;
    save(cfg.datafile);
    disp(['Tiling Q-iteration finished. Data was saved to [' cfg.datafile '].']);
end;


% set output
if cfg.run || cfg.resume,
    out = qistats;
elseif cfg.replay
    out = h;
end;

% END centrfzrl() ================================================================================


% Code to perform PF in qiteration - to verify whether for tiling theta = PF theta for any
% theta (which is by the way not true, as code below showed)
%                         ii = lin2ndi(i, X0DIM);
%                         TF(i, j) = tiling_get(xdata, theta, tdim, [X0(ii(1)); XDOT0(ii(2)); j]);
💿 文件大小 64 K
👤 上传用户 sky8997991
📂 所属分类数学计算
🏷️ 相关标签

#reinforcement #approximate #learning
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -