tilingqi.m

来自「approximate reinforcement learning」· M 代码 · 共 369 行

M
369
字号
function out = tilingqi(cfg)
% Approximate offline, model-based Q-iteration with tile-coding approximation
%   OUT = TILINGQI(CFG)
% Inputs:
%   CFG     - structure with fields as commented in the code below
%           can also be given as a string, see help str2cfg
% Outputs:
%   OUT     - either the Q-iteration statistics if the algorithm was run or resumed,
%           or the control history if only a replay was performed
%


if nargin < 1, cfg = struct(); end;

% -----------------------------------------------
% Process configuration

% default config
% script config
CFG.run = 0;                        % run learning
CFG.resume = 0;                     % resume learning
CFG.replay = 0;                     % replay learned policy
CFG.problem = '';                   % what problem to solve
CFG.loadapprox = '';                % load approximator data from file
CFG.datafile = 'tqidata';           % save data to file
% learning config
CFG.gamma = 0.98;                   % discount factor
CFG.eps = .1;                       % threshold for convergence
CFG.maxiter = 300;                  % max number of iterations for phi-iteration
CFG.serial = 1;                     % serial or parallel Q-iteration
CFG.storeact = -1;                  % whether to store activation (membership) vectors (-1 for auto)
% replay config
CFG.interph = 1;                    % interpolated (averaged) policy
CFG.x0 = [];                        % initial state for replay (empty for default)
CFG.tend = 30;                      % end time for replay
% stats & figure output config
CFG.plottarget = 'screen';          % 'screen', '', 'latex', or 'beamer'
CFG.savetheta = 0;                  % save param history in stats NOT IMPLEMENTED
CFG.savedir = '';
CFG.savefig = '';
% display config
CFG.verb = 5;                       % verbosity: the higher, the more detailed the messages displayed
CFG.noplot = 0;                     % whether to suppress figure plots
CFG.silent = 0;                     % suppress all output
CFG.initstepdisp = .1;              % feedback every 10% of MDP init
CFG.iterdisp = 10;                  % feedback after every 10 iteration
CFG.itersave = 25;                  % save after every 25 iterations

% If caller provided string, parse it into a structure
if ischar(cfg),
    cfg = str2cfg(cfg, fieldnames(CFG));
end;
% Try installing problem defaults whenever caller does not specify values
% Problem must be specified for this to work
try     
    if ~isempty(cfg.problem), cfg = checkparams(cfg, feval(cfg.problem, 'tiling')); end;
catch   % do nothing
end;
% Install function defaults for everything else
cfg = checkparams(cfg, CFG);
% Process configuration dependencies
if cfg.silent, cfg.verb = -Inf; cfg.noplot = 1; end;
% Echo config
dispx('Tile-coding Q-iteration called with the following configuration:', cfg.verb, 1);
dispx(cfg, cfg.verb, 1);

cfg.noinit = (cfg.resume || cfg.replay) && exist([cfg.datafile '.mat'], 'file');

if cfg.noinit,        % load data file, making sure that cfg is not overwritten
    cfg1 = cfg;
    load(cfg.datafile);
    cfg = cfg1; clear cfg1;
end;

% -----------------------------------------------
% Setup model
if ~cfg.noinit,
    model = feval(cfg.problem, 'model');
end;

% load tiling, trans, etc. data if given a tiling data file
if ~isempty(cfg.loadapprox),
    load(cfg.loadapprox, 'X', 'X0', 'U', 'DIMS', 'XDATA', 'TDIM', 'MDP', 'P', 'tcfg');
    dispx(['Initialization (tiling, transition, and projection data) loaded from [' cfg.loadapprox '].'], cfg.verb, 1);
end;


% -----------------------------------------------
% Setup tiling and command discretization, if not loaded
if ~cfg.noinit && isempty(cfg.loadapprox),
    checkparams(cfg, [], {'xgrids', 'ugrids', 'tcfg'});
    X = cfg.xgrids; U = cfg.ugrids; tcfg = cfg.tcfg;

    DIMS.p = length(X); DIMS.q = length(U);   % # of states and outputs
    DIMS.dimx = []; DIMS.dimu = [];           % dimensions of grids
    for p = 1:DIMS.p, DIMS.dimx(end+1) = length(X{p}); end;
    for q = 1:DIMS.q, DIMS.dimu(end+1) = length(U{q}); end;
    
    % create tiling
    [XDATA, theta, TDIM] = tiling_create(X, DIMS.dimu, tcfg.c, tcfg);
    clear tdata;

    % # of parameters
    DIMS.N = TDIM.c * prod(DIMS.dimx);      % number of tilings x number of grid hypercubes
    DIMS.M = prod(DIMS.dimu);
    
    % choose sample points as center points of all hypercubes formed by intersections
    % of all tiling surfaces
    X0 = cell(DIMS.p, 1);
    DIMS.dimx0 = [];
    for p = 1:DIMS.p, 
        xp = sort(XDATA{p}(:));
        xp = xp(~isnan(xp)); xp = [xp(diff(xp) ~= 0); xp(end)]; % remove duplicates and NaNs
        X0{p} = xp(1:end-1) + diff(xp)/2;                       % pick center of every interval
        DIMS.dimx0(end+1) = length(X0{p});
    end;
    DIMS.N0 = prod(DIMS.dimx0);         % total number of state samples
end;


% -----------------------------------------------
% Q-iteration
if cfg.run || cfg.resume,
    
    % -----------------------------------------------
    % MDP transition data, if not resuming and not given a transition data file
    if ~cfg.resume && isempty(cfg.loadapprox),

        if cfg.storeact < 0,        % auto
            % decide heuristically whether to store activations of next states:
            % if number of elements to store < 5000, or the number of elements is
            % at most 5 times greater a plain transition table would require
            cfg.storeact = ((DIMS.N0 * DIMS.M * TDIM.c)) < 5000 || (TDIM.c <= 5*DIMS.p);
        end;
        if cfg.storeact,    dispx('Computing MDP, sample, and next state activation data...', cfg.verb, 0);
        else                dispx('Computing MDP and sample activation data...', cfg.verb, 0);
        end;
        t = cputime;
        
        if cfg.storeact,    MDP.F = zeros(DIMS.N0, DIMS.M, TDIM.c);
        else                MDP.F = zeros(DIMS.N0, DIMS.M, DIMS.p);
        end;

        % init MDP structure and activation matrix
        
        MDP.R = zeros(DIMS.N0, DIMS.M);
        PHI = sparse(DIMS.N, DIMS.N0);

        % iterate over (xi, uj) \in (X0, U0) and populate MDP structures
        x = zeros(DIMS.p, 1); u = zeros(DIMS.q, 1);
        prog = cfg.initstepdisp;           
        for i = 1:DIMS.N0,
            % compute linear indices and construct state 
            ii = lin2ndi(i, DIMS.dimx0);
            for p = 1:DIMS.p, x(p) = X0{p}(ii(p)); end;

            % record activated tiles in activation matrix (only retrieve for first discrete
            % action as we don't care about the discrete components)
            [tv ti] = tiling_get(XDATA, theta, TDIM, [x; 1]);
            PHI(ti, i) = 1/TDIM.c;
            
            % process next state for each action
            for j = 1:DIMS.M,
                % compute linear indices and construct input
                jj = lin2ndi(j, DIMS.dimu);
                for q = 1:DIMS.q, u(q) = U{q}(jj(q)); end;

                [xplus, rplus] = feval(model.fun, model, x, u);
                % store either activated tiles for xplus, or xplus itself
                if cfg.storeact,
                    [tv MDP.F(i, j, :)] = tiling_get(XDATA, theta, TDIM, [xplus; 1]);            
                else
                    MDP.F(i, j, :) = xplus;
                end;
                MDP.R(i, j) = rplus;
            end;

            if i/DIMS.N0 > prog,     % progress feedback
                dispx([num2str(prog * 100) '% completed...'], cfg.verb, 2);
                prog = prog + cfg.initstepdisp;
            end;
        end;                % FOR over state samples

        dispx('done.', cfg.verb, 2);
        clear tv;       % unneeded variable

        % compute projection matrix from activation matrix PHI
        dispx('Computing projection matrix', cfg.verb, 1);
        nrm = sum(PHI, 2); P = sparse(DIMS.N, DIMS.N0);
        for i = 1:DIMS.N, P(i, :) = PHI(i, :) / nrm(i); end;
        % activation matrix no longer needed
        clear PHI;
        
        % record how much time MDP data computation took (disregarding that progress display is
        % also counted here)
        qistats.tinit = cputime - t;

        save(cfg.datafile);
        dispx(['Initialization data saved to [' cfg.datafile ']'], cfg.verb, 1);
        
    end;                    % IF initialize MDP etc.
   
    % -----------------------------------------------
    % Do Q-iteration
    dispx('Performing tiling Q-value iteration...', cfg.verb, 0);
    
    % Q-iteration parameters: cfg.eps, cfg.maxiter, cfg.gamma
    % init param vector and iteration index when not resuming
    if ~cfg.resume,
        qistats.delta = [];
        qistats.t = 0;
        dispx('Initializing approximator parameters', cfg.verb, 2);
        theta = tiling_reset(TDIM, tcfg);
        k = 1; 
    end;
    delta = inf;
    t = cputime;
    TF = zeros(DIMS.N0, DIMS.M);      % records the rhs of Bellman operator for all samples
    while k <= cfg.maxiter && delta > cfg.eps,       % main loop
        
        % compute the rhs of approximate Bellman operator, 
        % [T(F(theta))](xi, uj) for all samples xi, uj
        if cfg.storeact,        % use the stored indices of the activated tiles
            for i = 1:DIMS.N0, 
                for j = 1:DIMS.M,
                    TF(i, j) = MDP.R(i, j) + cfg.gamma * max( mean(theta(MDP.F(i, j, :), :)) );
                end; 
            end;
        else            
            for i = 1:DIMS.N0, 
                for j = 1:DIMS.M,
                    TF(i, j) = MDP.R(i, j) + cfg.gamma * ...
                        max( tiling_get(XDATA, theta, TDIM, MDP.F(i, j, :)) );
                end; 
            end;
        end;
                
        % update weight data
        thetaold = theta;
        theta = P * TF;     % project TF into theta using P
        delta = max(max(abs(theta - thetaold)));        

        % update stats
        qistats.t = qistats.t + (cputime - t);
        qistats.delta(end + 1) = delta;
        
        % visual feedback of algorithm progress
        if ~mod(k, cfg.iterdisp), 
            dispx(['k=' num2str(k) ' iteration completed, delta=' num2str(delta)], cfg.verb, 2);
        end;
        % data backup
        if ~mod(k, cfg.itersave),
            save(cfg.datafile);
            dispx(['Intermediary data at k=' num2str(k) ' saved to [' cfg.datafile '].'], cfg.verb, 1);
        end;
        
        t = cputime;
        k = k + 1;
    end;        % while not converged
    
    % remove unneeded large arrays
    clear TF;

    if delta < cfg.eps,	dispx('Convergence detected. Algorithm stopped.', cfg.verb, 0);
    else                dispx(['maxiter=' num2str(cfg.maxiter) ' exhausted. Algorithm stopped'], cfg.verb, 0);
    end;
end;


% -----------------------------------------------
% Replay
if cfg.replay,

    % compute locally optimal policy (local optimal discrete action for each basis
    % function/tile)
    if cfg.interph
        [Qstar ui] = max(theta, [], 2); clear Qstar;
        ui = lin2ndi(ui, DIMS.dimu);
        hstar = zeros(DIMS.N, DIMS.q);
        for q = 1:DIMS.q,
            hstar(:, q) = U{q}(ui(:, q));
        end;
    end;
        
    % initial state
    if ~isempty(cfg.x0),      % specified initial state
        x0 = cfg.x0(:);
    else                      % zeros
        x0 = zeros(p, 1);
    end;
    dispx(['Controlling from x0=' num2str(reshape(x0, 1, [])) ], cfg.verb, 1);


    % history
    t = 0 : model.Ts : cfg.tend;
    x = zeros(DIMS.p, length(t)); x(:, 1) = x0;
    u = zeros(DIMS.q, length(t)); u(:, end) = NaN;
    r = zeros(1, length(t)); r(1) = NaN;

    % steps loop
    for k = 1:length(t)-1,
        % compute optimal action
        [Qstar istar] = tiling_get(XDATA, theta, TDIM, x(:, k));
        if cfg.interph,     % average between local maxima corresponding to each tile
            u(:, k) = mean(hstar(istar, :));
        else                % crisp selection using the approximate Q-values of all u in U
            ui = find(Qstar == max(Qstar)); ui = ui(ceil(rand * length(ui)));
            ui = lin2ndi(ui, DIMS.dimu);
            for q = 1:DIMS.q,
                u(q, k) = U{q}(ui(:, q));
            end;
        end;
        
        % apply to system
        [x(:, k+1) r(k+1)] = feval(model.fun, model, x(:, k), u(:, k));
    end;

    % plot history & optionally save figures
    hist.t = t; hist.x = x; hist.u = u; hist.r = r;
    if ~cfg.noplot,
        toscreen = strcmp(cfg.plottarget, 'screen') || isempty(cfg.plottarget);
        figh = plothistory(hist);
        saveplot(figh, [cfg.savedir cfg.savefig], cfg.plottarget);
        % if the model supplies a plot function, use it to plot the trajectory
        if isfield(model, 'plotfun'),
            figh(end+1) = feval(model.plotfun, hist);
        end;
%             pause;
        % close the figures if their target was not the screen
        if ~toscreen, close(figh); end;
    end;

        
end;        % IF replay


% -----------------------------------------------
% Backup data

if cfg.run || cfg.resume, 
    % save into the same directory as the problem
    savedir = fileparts(which(cfg.problem));
    % if default file name is used, generate a unique file name
    if strcmp(cfg.datafile, CFG.datafile),
        c = fix(clock);
        for i = 1:length(c),
            cfg.datafile = [cfg.datafile '-' num2str(c(i), '%02d')];
        end;
        delete([CFG.datafile '.mat']);
    elseif ~strcmp(savedir, pwd)
        delete([cfg.datafile '.mat']);   % need to save in a different directory, so delete anyway
    end;    % otherwise just overwrite
    cfg.datafile = [savedir '\' cfg.datafile];
    save(cfg.datafile);
    dispx(['Tile-coding Q-iteration finished. Data was saved to [' cfg.datafile '].'], cfg.verb, 1);
end;


% -----------------------------------------------
% set output
if cfg.run || cfg.resume,
    out = qistats;
elseif cfg.replay       % output history
    out = hist;
end;


% END tilingqi() RETURNING out =================================================================

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?