📄 fuzzyqi.m
function varargout = fuzzyqi(cfg)
% Approximate offline, model-based Q-iteration with fuzzy approximation [1]
%   VARARGOUT = FUZZYQI(CFG)
% Inputs:
%   CFG     - structure with fields as commented in the code below;
%             can also be given as a string, see str2cfg
% Outputs:
%   VARARGOUT{1} - either the Q-iteration statistics if the algorithm was run or resumed,
%             or the control history if only a replay was performed
%   VARARGOUT{2} - output only in replay mode. Figure handles if figures were created
%             and not closed; an empty matrix if all the figures were closed.
%
% [1] Busoniu, L.; Ernst, D.; De Schutter, B. and Babuska, R.,
%     "Fuzzy Approximation for Convergent Model-Based Reinforcement Learning",
%     IEEE International Conference on Fuzzy Systems (FUZZ-IEEE-07), 2007 (submitted)

if nargin < 1, cfg = struct(); end;

% -----------------------------------------------
% Process configuration structure

% default config
% script config
CFG.run = 0;                % run learning
CFG.resume = 0;             % resume learning
CFG.replay = 0;             % replay learned policy
CFG.problem = '';           % what problem to solve
CFG.loadapprox = '';        % load approximator data from file
CFG.datafile = 'fzqidata';  % save data to file
% learning config
CFG.gamma = 0.98;           % discount factor
CFG.eps = .01;              % threshold for convergence
CFG.maxiter = 300;          % max number of iterations for Q-iteration
CFG.serial = 1;             % serial or parallel Q-iteration
CFG.storeact = 1;           % whether to store activation (membership) vectors (-1 for auto)
% replay config
CFG.interph = 1;            % interpolated (averaged) policy
CFG.x0 = [];                % initial state for replay (otherwise the problem default or zeros)
CFG.tend = 30;              % end time for replay
% stats & figure output config
CFG.plottarget = 'screen';  % 'screen', '', 'latex', or 'beamer'; if 'screen', figures will not be closed
CFG.savetheta = 0;          % save param history in stats
CFG.savedir = '';           % directory for saved figures (used as a prefix of the file name)
CFG.savefig = '';           % file name for saved figures
% display config
CFG.verb = 5;               % verbosity: the higher, the more detailed the messages displayed
CFG.noplot = 0;             % whether to suppress figure plots
CFG.silent = 0;             % suppress all output
CFG.initstepdisp = .1;      % feedback every 10% of MDP init
CFG.iterdisp = 10;          % feedback after every 10 iterations
CFG.itersave = 25;          % save after every 25 iterations

% If the caller provided a string, parse it into a structure
if ischar(cfg),
    cfg = str2cfg(cfg, fieldnames(CFG));
end;

% Try installing problem defaults whenever the caller does not specify values.
% The problem must be specified for this to work.
try
    if ~isempty(cfg.problem), cfg = checkparams(cfg, feval(cfg.problem, 'fuzzy')); end;
catch
    % do nothing
end;

% Install function defaults for everything else
cfg = checkparams(cfg, CFG);

% Process configuration dependencies
if cfg.silent, cfg.verb = -Inf; cfg.noplot = 1; end;

% Echo config
dispx('Fuzzy Q-iteration called with the following configuration:', cfg.verb, 1);
dispx(cfg, cfg.verb, 1);

cfg.noinit = (cfg.resume || cfg.replay) && exist([cfg.datafile '.mat'], 'file');
if cfg.noinit,
    % load the data file, making sure that cfg is not overwritten
    cfg1 = cfg;
    load(cfg.datafile);
    cfg1.storeact = cfg.storeact;   % this must be the same as in the loaded file
    cfg = cfg1; clear cfg1;
    dispx(['Current data loaded from [' cfg.datafile '].'], cfg.verb, 1);
end;

% -----------------------------------------------
% Setup model
if ~cfg.noinit,
    model = feval(cfg.problem, 'model');
end;

% load tiling, transition, etc. data if given an approximator data file
if cfg.loadapprox,
    load(cfg.loadapprox, 'X', 'U', 'DIMS', 'XMFS', 'MDP');
    dispx(['Initialization (approximator and sample data) loaded from [' cfg.loadapprox '].'], cfg.verb, 1);
end;

% -----------------------------------------------
% Setup approximator structure (X mfs, dimensions, sample sets), if not loaded
if ~cfg.noinit && isempty(cfg.loadapprox),
    % ensure grids were supplied
    checkparams(cfg, [], {'xgrids', 'ugrids'});
    X = cfg.xgrids; U = cfg.ugrids;
    DIMS.p = length(X); DIMS.q = length(U);     % # of state and action variables
    DIMS.dimx = []; DIMS.dimu = [];             % dimensions of grids
    XMFS = {};
    for p = 1:DIMS.p,
        XMFS{p} = gen_mfs(X{p});
        DIMS.dimx(end+1) = length(X{p});
    end;
    for q = 1:DIMS.q,
        DIMS.dimu(end+1) = length(U{q});
    end;
    % # of parameters
    DIMS.N = prod(DIMS.dimx); DIMS.M = prod(DIMS.dimu);
end;
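% For intuition, a comment-only sketch (not executed) of how the approximator
% above represents Q-values. gen_mfs and mdegs are external helpers; the sketch
% assumes gen_mfs builds a triangular partition with cores on the grid points,
% so that mdegs returns per-dimension membership degrees summing to 1 (at most
% two of them nonzero). Per-dimension degrees are combined into a joint
% activation vector by repeated outer products, and Q is linear in the
% parameter matrix theta (N x M):
%
%   mu = mdegs(x(1), XMFS{1});              % degrees along state dimension 1
%   for p = 2:DIMS.p, mu = mu(:) * mdegs(x(p), XMFS{p})'; end;  % outer products
%   phi = find(mu); mu = mu(phi);           % indices & values of nonzero degrees
%   Qx = mu' * theta(phi, :);               % 1 x M row: Q(x, u_j) for j = 1..M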
% -----------------------------------------------
% Q-iteration
if cfg.run || cfg.resume,
    % -----------------------------------------------
    % MDP transition data, if not resuming and not given a transition data file
    if ~cfg.resume && isempty(cfg.loadapprox),
        % estimate the storage size required for membership degrees, assuming most next states
        % activate 2^p membership functions
        actstorage = DIMS.N * DIMS.M * 2^DIMS.p;
        if cfg.storeact < 0,        % auto
            % decide heuristically whether it's better to store mdegs;
            % the lhs of the inequality is the number of mdegs which would be stored
            cfg.storeact = actstorage < 50000;
        end;
        % override storeact when the variable size would be too large
        if actstorage > 1e6,
            cfg.storeact = 0;
            dispx(['Disabling (overriding) membership storage, size too large: ' num2str(actstorage)], cfg.verb, -1);
        end;
        if cfg.storeact,
            dispx('Computing MDP and membership data...', cfg.verb, 0);
        else
            dispx('Computing MDP data...', cfg.verb, 0);
        end;
        t = cputime;
        % init MDP structures
        if cfg.storeact,
            MDP.F = cell(DIMS.N, DIMS.M);
        else
            MDP.F = zeros(DIMS.N, DIMS.M, DIMS.p);
        end;
        MDP.R = zeros(DIMS.N, DIMS.M);      % preallocated to match the (i, j) indexing below
        % iterate over (xi, uj) \in (X0, U0) and populate the MDP structures
        x = zeros(DIMS.p, 1); u = zeros(DIMS.q, 1);
        prog = cfg.initstepdisp;
        for i = 1:DIMS.N,
            % compute linear indices and construct the state
            ii = lin2ndi(i, DIMS.dimx);
            for p = 1:DIMS.p, x(p) = X{p}(ii(p)); end;
            for j = 1:DIMS.M,
                % compute linear indices and construct the input
                jj = lin2ndi(j, DIMS.dimu);
                for q = 1:DIMS.q, u(q) = U{q}(jj(q)); end;
                [xplus, rplus] = feval(model.fun, model, x, u);
                if cfg.storeact,
                    % instead of xnext, compute & store membership degrees directly
                    mu = mdegs(xplus(1), XMFS{1});
                    for p = 2:DIMS.p, mu = mu(:) * mdegs(xplus(p), XMFS{p})'; end;
                    phi = find(mu);
                    MDP.F{i, j} = [phi mu(phi)];
                else
                    MDP.F(i, j, :) = xplus;
                end;
                MDP.R(i, j) = rplus;
            end;        % FOR over action discretization
            if i/DIMS.N > prog,     % progress feedback
                dispx([num2str(prog * 100) '% completed...'], cfg.verb, 2);
                prog = prog + cfg.initstepdisp;
            end;
        end;            % FOR over state samples
        % record how much time the MDP data computation took (disregarding that the
        % progress display is also counted here)
        qistats.tinit = cputime - t;
        dispx('done.', cfg.verb, 2);
        save(cfg.datafile);
        dispx(['Initialization data saved to [' cfg.datafile '].'], cfg.verb, 1);
    end;    % IF need to initialize MDP

    % -----------------------------------------------
    % Do Q-iteration
    dispx('Performing fuzzy Q-value iteration...', cfg.verb, 0);
    % Q-iteration parameters on the config: cfg.gamma, cfg.eps, cfg.maxiter
    % init param matrix and iteration index when not resuming
    if ~cfg.resume,
        qistats.delta = []; qistats.t = 0;
        theta = zeros(DIMS.N, DIMS.M);
        if cfg.savetheta,
            qistats.theta = cell(cfg.maxiter+1, 1);     % also allow for theta_0
            qistats.theta{1} = theta;                   % save theta_0 on the stats
        end;
        k = 1;
    end;
    delta = inf;
    t = cputime;
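    % For reference, a compact restatement of the update applied in the loop
    % below (see [1] for the formal treatment): with x' = f(x_i, u_j) the next
    % state and phi(x') its joint activation vector,
    %
    %   theta(i, j) <- R(i, j) + gamma * max_{j'} [ phi(x')' * theta(:, j') ]
    %
    % Serial (Gauss-Seidel) mode reuses entries of theta already updated within
    % the current sweep; parallel mode uses only thetaold from the previous
    % sweep. [1] establishes convergence of these iterations; the loop stops
    % once the largest parameter change delta drops to cfg.eps or below.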
    while k <= cfg.maxiter && delta > cfg.eps,      % main loop
        thetaold = theta;
        % compute the policy optimal in Q
        % loop over the (xi, uj) samples
        for i = 1:DIMS.N,
            for j = 1:DIMS.M,
                if cfg.storeact,    % retrieve precomputed mdegs
                    phi = MDP.F{i, j}(:, 1);
                    mu = MDP.F{i, j}(:, 2);
                else                % retrieve x, compute mdegs here
                    x = MDP.F(i, j, :);
                    mu = mdegs(x(1), XMFS{1});
                    for p = 2:DIMS.p, mu = mu(:) * mdegs(x(p), XMFS{p})'; end;
                    phi = find(mu); mu = mu(phi);
                end;
                if cfg.serial,      % Gauss-Seidel, serial Q-iteration
                    theta(i, j) = MDP.R(i, j) + cfg.gamma * max(mu' * theta(phi, :));
                else                % parallel Q-iteration
                    theta(i, j) = MDP.R(i, j) + cfg.gamma * max(mu' * thetaold(phi, :));
                end;
            end;    % FOR uj
        end;        % FOR xi
        % compute the infinity-norm of the function (not of the matrix)
        delta = max(max(abs(theta - thetaold)));
        % update stats
        qistats.t = qistats.t + (cputime - t);
        qistats.delta(end + 1) = delta;
        if cfg.savetheta, qistats.theta{k+1} = theta; end;      % save theta on the stats
        % visual feedback of algorithm progress
        if ~mod(k, cfg.iterdisp),
            dispx(['k=' num2str(k) ' iteration completed, delta=' num2str(delta)], cfg.verb, 2);
        end;
        % data backup
        if ~mod(k, cfg.itersave),
            save(cfg.datafile);
            dispx(['Intermediate data at k=' num2str(k) ' saved to [' cfg.datafile '].'], cfg.verb, 1);
        end;
        t = cputime;
        k = k + 1;
    end;    % WHILE not converged and allowed more iterations

    if delta <= cfg.eps,
        dispx('Convergence detected. Algorithm stopped.', cfg.verb, 0);
    else
        dispx(['maxiter=' num2str(cfg.maxiter) ' exhausted. Algorithm stopped.'], cfg.verb, 0);
    end;
end;
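% A note on how the greedy policy is obtained during replay below (comment-only
% sketch; u*_i denotes the discrete action maximizing theta(i, :)). The
% interpolated policy (cfg.interph) blends the locally optimal actions with the
% same joint membership degrees that represent Q:
%
%   h(x) = sum_i phi_i(x) * u*_i
%
% whereas the crisp variant directly picks argmax_j phi(x)' * theta(:, j),
% breaking ties randomly.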
% -----------------------------------------------
% Replay
if cfg.replay,
    % compute the locally optimal policy (locally optimal discrete action for each
    % basis function/tile)
    if cfg.interph,
        [Qstar, ui] = max(theta, [], 2); clear Qstar;
        ui = lin2ndi(ui, DIMS.dimu);
        hstar = zeros(DIMS.N, DIMS.q);
        for q = 1:DIMS.q,
            hstar(:, q) = U{q}(ui(:, q));
        end;
    end;
    % initial state
    if ~isempty(cfg.x0),    % specified initial state
        x0 = cfg.x0(:);
    else                    % zeros
        x0 = zeros(DIMS.p, 1);
    end;
    dispx(['Controlling from x0=' num2str(reshape(x0, 1, []))], cfg.verb, 0);
    % history
    t = 0 : model.Ts : cfg.tend;
    Ns = length(t)-1;       % number of samples / time instants at which control is applied
    x = zeros(DIMS.p, length(t)); x(:, 1) = x0;
    u = zeros(DIMS.q, length(t)); u(:, end) = NaN;
    r = zeros(1, length(t)); r(1) = NaN;
    % steps loop
    for k = 1:Ns,
        % compute mdegs of the current state
        mu = mdegs(x(1, k), XMFS{1});
        for p = 2:DIMS.p, mu = mu(:) * mdegs(x(p, k), XMFS{p})'; end;
        phi = find(mu); mu = mu(phi);
        % compute the optimal action
        if cfg.interph,     % either interpolated
            u(:, k) = (mu' * hstar(phi, :))';
        else                % or crisp (ties broken randomly)
            Qa = mu' * theta(phi, :);
            ui = find(Qa == max(Qa));
            ui = ui(ceil(rand * length(ui)));
            ui = lin2ndi(ui, DIMS.dimu);
            for q = 1:DIMS.q, u(q, k) = U{q}(ui(:, q)); end;
        end;
        % apply to the system
        [x(:, k+1), r(k+1)] = feval(model.fun, model, x(:, k), u(:, k));
    end;
    % compute the returns vector, assuming the reward remains constant up to infinity
    % discounting vector starting from t=0
    discounting = cfg.gamma .^ (0:Ns);      % 0:Ns-1 for the rewards; Ns for the infinity reward
    % discounted rewards from t=0; append also the infinite-horizon tail: r_inf * 1/(1-gamma)
    R = [r(2:Ns+1) r(end)/(1-cfg.gamma)] .* discounting;
    % [gamma^0*R_0 gamma^1*R_1 ... gamma^K*R_K gamma^(K+1)*Rinf] (reverse to do the cumulative sum)
    R = cumsum(R(end:-1:1));
    % divide R_k by gamma^k for each k, to find the return starting from k
    R = R(end:-1:1) ./ discounting;
    % plot history & optionally save figures
    hist.t = t; hist.x = x; hist.u = u; hist.r = r; hist.R = R;
    toscreen = strcmp(cfg.plottarget, 'screen') || isempty(cfg.plottarget);
    figh = [];              % stays empty when plotting is suppressed
    if ~cfg.noplot,
        figh = plothistory(hist);
        saveplot(figh, [cfg.savedir cfg.savefig], cfg.plottarget);
        % if the model supplies a plot function, use it to plot the trajectory
        if isfield(model, 'plotfun'),
            figh(end+1) = feval(model.plotfun, hist);
        end;
        % pause;
        % close the figures if their target was not the screen
        if ~toscreen, close(figh); end;
    end;
end;    % IF replay

% -----------------------------------------------
% Backup data
if cfg.run || cfg.resume,
    % save into the same directory as the problem
    savedir = fileparts(which(cfg.problem));
    % if the default file name is used, generate a unique file name
    if strcmp(cfg.datafile, CFG.datafile),
        c = fix(clock);
        for i = 1:length(c),
            cfg.datafile = [cfg.datafile '-' num2str(c(i), '%02d')];
        end;
        delete([CFG.datafile '.mat']);
    elseif ~strcmp(savedir, pwd),
        delete([cfg.datafile '.mat']);  % need to save in a different directory, so delete anyway
    end;    % otherwise just overwrite
    cfg.datafile = [savedir filesep cfg.datafile];
    save(cfg.datafile);
    dispx(['Fuzzy Q-iteration finished. Data was saved to [' cfg.datafile '].'], cfg.verb, 1);
end;

% -----------------------------------------------
% Set output
if cfg.run || cfg.resume,       % output the Q-iteration statistics
    varargout = {qistats};
elseif cfg.replay,              % output the history and possibly figure handles
    if toscreen,
        varargout = {hist, figh};
    else
        varargout = {hist, []};
    end;
end;

% END fuzzyqi() RETURNING varargout ===============================================================
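% Example usage (a sketch, not part of the original file; it assumes a
% hypothetical problem function 'mydoubleint' on the path that answers the
% 'model' and 'fuzzy' queries used above -- the grids and file names below are
% illustrative only):
%
%   cfg = struct();
%   cfg.run = 1;
%   cfg.problem = 'mydoubleint';          % hypothetical problem definition
%   cfg.datafile = 'dblint_fzqi';         % where to store the learning data
%   cfg.xgrids = {-1:.25:1, -2:.5:2};     % fuzzy cores, one cell per state variable
%   cfg.ugrids = {[-1 0 1]};              % discrete action grid
%   qistats = fuzzyqi(cfg);               % run fuzzy Q-iteration
%
%   % afterwards, replay the learned policy from a given initial state
%   rcfg = struct('replay', 1, 'datafile', 'dblint_fzqi', 'tend', 10);
%   rcfg.x0 = [0.5; 0];
%   hist = fuzzyqi(rcfg);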