📄 fuzzyqi.m
function varargout = fuzzyqi(cfg)
% Approximate offline, model-based Q-iteration with fuzzy approximation [1]
%   VARARGOUT = FUZZYQI(CFG)
% Inputs:
%   CFG     - structure with fields as commented in the code below;
%             can also be given as a string, see str2cfg
% Outputs:
%   VARARGOUT{1} - either the Q-iteration statistics if the algorithm was run or resumed,
%             or the control history if only a replay was performed
%   VARARGOUT{2} - output only in replay mode. Figure handles if figures were created
%             and not closed; an empty matrix if all the figures were closed.
%
% [1] Busoniu, L.; Ernst, D.; De Schutter, B. and Babuska, R.,
%     "Fuzzy Approximation for Convergent Model-Based Reinforcement Learning",
%     IEEE International Conference on Fuzzy Systems (FUZZ-IEEE-07), 2007 (submitted)

if nargin < 1, cfg = struct(); end;

% -----------------------------------------------
% Process configuration structure

% default config
% script config
CFG.run = 0;                % run learning
CFG.resume = 0;             % resume learning
CFG.replay = 0;             % replay learned policy
CFG.problem = '';           % what problem to solve
CFG.loadapprox = '';        % load approximator data from file
CFG.datafile = 'fzqidata';  % save data to file
% learning config
CFG.gamma = 0.98;           % discount factor
CFG.eps = .01;              % threshold for convergence
CFG.maxiter = 300;          % max number of iterations for Q-iteration
CFG.serial = 1;             % serial or parallel Q-iteration
CFG.storeact = 1;           % whether to store activation (membership) vectors (-1 for auto)
% replay config
CFG.interph = 1;            % interpolated (averaged) policy
CFG.x0 = [];                % initial state for replay (otherwise the problem default or zeros)
CFG.tend = 30;              % end time for replay
% stats & figure output config
CFG.plottarget = 'screen';  % 'screen', '', 'latex', or 'beamer'; if 'screen', figures will not be closed
CFG.savetheta = 0;          % save param history in stats
CFG.savedir = '';           % directory for saved figures (used as a prefix of the file name)
CFG.savefig = '';           % file name for saved figures
% display config
CFG.verb = 5;               % verbosity: the higher, the more detailed the messages displayed
CFG.noplot = 0;             % whether to suppress figure plots
CFG.silent = 0;             % suppress all output
CFG.initstepdisp = .1;      % feedback every 10% of MDP init
CFG.iterdisp = 10;          % feedback after every 10 iterations
CFG.itersave = 25;          % save after every 25 iterations

% If the caller provided a string, parse it into a structure
if ischar(cfg),
    cfg = str2cfg(cfg, fieldnames(CFG));
end;

% Try installing problem defaults whenever the caller does not specify values.
% The problem must be specified for this to work.
try
    if ~isempty(cfg.problem), cfg = checkparams(cfg, feval(cfg.problem, 'fuzzy')); end;
catch
    % do nothing
end;

% Install function defaults for everything else
cfg = checkparams(cfg, CFG);

% Process configuration dependencies
if cfg.silent, cfg.verb = -Inf; cfg.noplot = 1; end;

% Echo config
dispx('Fuzzy Q-iteration called with the following configuration:', cfg.verb, 1);
dispx(cfg, cfg.verb, 1);

cfg.noinit = (cfg.resume || cfg.replay) && exist([cfg.datafile '.mat'], 'file');
if cfg.noinit,
    % load the data file, making sure that cfg is not overwritten
    cfg1 = cfg;
    load(cfg.datafile);
    cfg1.storeact = cfg.storeact;   % this must be the same as in the loaded file
    cfg = cfg1; clear cfg1;
    dispx(['Current data loaded from [' cfg.datafile '].'], cfg.verb, 1);
end;

% -----------------------------------------------
% Setup model
if ~cfg.noinit,
    model = feval(cfg.problem, 'model');
end;

% load tiling, transition, etc. data if given an approximator data file
if cfg.loadapprox,
    load(cfg.loadapprox, 'X', 'U', 'DIMS', 'XMFS', 'MDP');
    dispx(['Initialization (approximator and sample data) loaded from [' cfg.loadapprox '].'], cfg.verb, 1);
end;

% -----------------------------------------------
% Setup approximator structure (X mfs, dimensions, sample sets), if not loaded
if ~cfg.noinit && isempty(cfg.loadapprox),
    % ensure grids were supplied
    checkparams(cfg, [], {'xgrids', 'ugrids'});
    X = cfg.xgrids; U = cfg.ugrids;
    DIMS.p = length(X); DIMS.q = length(U);     % # of state and action variables
    DIMS.dimx = []; DIMS.dimu = [];             % dimensions of grids
    XMFS = {};
    for p = 1:DIMS.p,
        XMFS{p} = gen_mfs(X{p});
        DIMS.dimx(end+1) = length(X{p});
    end;
    for q = 1:DIMS.q,
        DIMS.dimu(end+1) = length(U{q});
    end;
    % # of parameters
    DIMS.N = prod(DIMS.dimx); DIMS.M = prod(DIMS.dimu);
end;
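% For intuition, a comment-only sketch (not executed) of how the approximator
% above represents Q-values. gen_mfs and mdegs are external helpers; the sketch
% assumes gen_mfs builds a triangular partition with cores on the grid points,
% so that mdegs returns per-dimension membership degrees summing to 1 (at most
% two of them nonzero). Per-dimension degrees are combined into a joint
% activation vector by repeated outer products, and Q is linear in the
% parameter matrix theta (N x M):
%
%   mu = mdegs(x(1), XMFS{1});              % degrees along state dimension 1
%   for p = 2:DIMS.p, mu = mu(:) * mdegs(x(p), XMFS{p})'; end;  % outer products
%   phi = find(mu); mu = mu(phi);           % indices & values of nonzero degrees
%   Qx = mu' * theta(phi, :);               % 1 x M row: Q(x, u_j) for j = 1..M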
% -----------------------------------------------
% Q-iteration
if cfg.run || cfg.resume,
    % -----------------------------------------------
    % MDP transition data, if not resuming and not given a transition data file
    if ~cfg.resume && isempty(cfg.loadapprox),
        % estimate the storage size required for membership degrees, assuming most next states
        % activate 2^p membership functions
        actstorage = DIMS.N * DIMS.M * 2^DIMS.p;
        if cfg.storeact < 0,        % auto
            % decide heuristically whether it's better to store mdegs;
            % the lhs of the inequality is the number of mdegs which would be stored
            cfg.storeact = actstorage < 50000;
        end;
        % override storeact when the variable size would be too large
        if actstorage > 1e6,
            cfg.storeact = 0;
            dispx(['Disabling (overriding) membership storage, size too large: ' num2str(actstorage)], cfg.verb, -1);
        end;
        if cfg.storeact,
            dispx('Computing MDP and membership data...', cfg.verb, 0);
        else
            dispx('Computing MDP data...', cfg.verb, 0);
        end;
        t = cputime;
        % init MDP structures
        if cfg.storeact,
            MDP.F = cell(DIMS.N, DIMS.M);
        else
            MDP.F = zeros(DIMS.N, DIMS.M, DIMS.p);
        end;
        MDP.R = zeros(DIMS.N, DIMS.M);      % preallocated to match the (i, j) indexing below
        % iterate over (xi, uj) \in (X0, U0) and populate the MDP structures
        x = zeros(DIMS.p, 1); u = zeros(DIMS.q, 1);
        prog = cfg.initstepdisp;
        for i = 1:DIMS.N,
            % compute linear indices and construct the state
            ii = lin2ndi(i, DIMS.dimx);
            for p = 1:DIMS.p, x(p) = X{p}(ii(p)); end;
            for j = 1:DIMS.M,
                % compute linear indices and construct the input
                jj = lin2ndi(j, DIMS.dimu);
                for q = 1:DIMS.q, u(q) = U{q}(jj(q)); end;
                [xplus, rplus] = feval(model.fun, model, x, u);
                if cfg.storeact,
                    % instead of xnext, compute & store membership degrees directly
                    mu = mdegs(xplus(1), XMFS{1});
                    for p = 2:DIMS.p, mu = mu(:) * mdegs(xplus(p), XMFS{p})'; end;
                    phi = find(mu);
                    MDP.F{i, j} = [phi mu(phi)];
                else
                    MDP.F(i, j, :) = xplus;
                end;
                MDP.R(i, j) = rplus;
            end;        % FOR over action discretization
            if i/DIMS.N > prog,     % progress feedback
                dispx([num2str(prog * 100) '% completed...'], cfg.verb, 2);
                prog = prog + cfg.initstepdisp;
            end;
        end;            % FOR over state samples
        % record how much time the MDP data computation took (disregarding that the
        % progress display is also counted here)
        qistats.tinit = cputime - t;
        dispx('done.', cfg.verb, 2);
        save(cfg.datafile);
        dispx(['Initialization data saved to [' cfg.datafile '].'], cfg.verb, 1);
    end;    % IF need to initialize MDP

    % -----------------------------------------------
    % Do Q-iteration
    dispx('Performing fuzzy Q-value iteration...', cfg.verb, 0);
    % Q-iteration parameters on the config: cfg.gamma, cfg.eps, cfg.maxiter
    % init param matrix and iteration index when not resuming
    if ~cfg.resume,
        qistats.delta = []; qistats.t = 0;
        theta = zeros(DIMS.N, DIMS.M);
        if cfg.savetheta,
            qistats.theta = cell(cfg.maxiter+1, 1);     % also allow for theta_0
            qistats.theta{1} = theta;                   % save theta_0 on the stats
        end;
        k = 1;
    end;
    delta = inf;
    t = cputime;
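    % For reference, a compact restatement of the update applied in the loop
    % below (see [1] for the formal treatment): with x' = f(x_i, u_j) the next
    % state and phi(x') its joint activation vector,
    %
    %   theta(i, j) <- R(i, j) + gamma * max_{j'} [ phi(x')' * theta(:, j') ]
    %
    % Serial (Gauss-Seidel) mode reuses entries of theta already updated within
    % the current sweep; parallel mode uses only thetaold from the previous
    % sweep. [1] establishes convergence of these iterations; the loop stops
    % once the largest parameter change delta drops to cfg.eps or below.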
    while k <= cfg.maxiter && delta > cfg.eps,      % main loop
        thetaold = theta;
        % compute the policy optimal in Q
        % loop over the (xi, uj) samples
        for i = 1:DIMS.N,
            for j = 1:DIMS.M,
                if cfg.storeact,    % retrieve precomputed mdegs
                    phi = MDP.F{i, j}(:, 1);
                    mu = MDP.F{i, j}(:, 2);
                else                % retrieve x, compute mdegs here
                    x = MDP.F(i, j, :);
                    mu = mdegs(x(1), XMFS{1});
                    for p = 2:DIMS.p, mu = mu(:) * mdegs(x(p), XMFS{p})'; end;
                    phi = find(mu); mu = mu(phi);
                end;
                if cfg.serial,      % Gauss-Seidel, serial Q-iteration
                    theta(i, j) = MDP.R(i, j) + cfg.gamma * max(mu' * theta(phi, :));
                else                % parallel Q-iteration
                    theta(i, j) = MDP.R(i, j) + cfg.gamma * max(mu' * thetaold(phi, :));
                end;
            end;    % FOR uj
        end;        % FOR xi
        % compute the infinity-norm of the function (not of the matrix)
        delta = max(max(abs(theta - thetaold)));
        % update stats
        qistats.t = qistats.t + (cputime - t);
        qistats.delta(end + 1) = delta;
        if cfg.savetheta, qistats.theta{k+1} = theta; end;      % save theta on the stats
        % visual feedback of algorithm progress
        if ~mod(k, cfg.iterdisp),
            dispx(['k=' num2str(k) ' iteration completed, delta=' num2str(delta)], cfg.verb, 2);
        end;
        % data backup
        if ~mod(k, cfg.itersave),
            save(cfg.datafile);
            dispx(['Intermediate data at k=' num2str(k) ' saved to [' cfg.datafile '].'], cfg.verb, 1);
        end;
        t = cputime;
        k = k + 1;
    end;    % WHILE not converged and allowed more iterations

    if delta <= cfg.eps,
        dispx('Convergence detected. Algorithm stopped.', cfg.verb, 0);
    else
        dispx(['maxiter=' num2str(cfg.maxiter) ' exhausted. Algorithm stopped.'], cfg.verb, 0);
    end;
end;
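% A note on how the greedy policy is obtained during replay below (comment-only
% sketch; u*_i denotes the discrete action maximizing theta(i, :)). The
% interpolated policy (cfg.interph) blends the locally optimal actions with the
% same joint membership degrees that represent Q:
%
%   h(x) = sum_i phi_i(x) * u*_i
%
% whereas the crisp variant directly picks argmax_j phi(x)' * theta(:, j),
% breaking ties randomly.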
% -----------------------------------------------
% Replay
if cfg.replay,
    % compute the locally optimal policy (locally optimal discrete action for each
    % basis function/tile)
    if cfg.interph,
        [Qstar, ui] = max(theta, [], 2); clear Qstar;
        ui = lin2ndi(ui, DIMS.dimu);
        hstar = zeros(DIMS.N, DIMS.q);
        for q = 1:DIMS.q,
            hstar(:, q) = U{q}(ui(:, q));
        end;
    end;
    % initial state
    if ~isempty(cfg.x0),    % specified initial state
        x0 = cfg.x0(:);
    else                    % zeros
        x0 = zeros(DIMS.p, 1);
    end;
    dispx(['Controlling from x0=' num2str(reshape(x0, 1, []))], cfg.verb, 0);
    % history
    t = 0 : model.Ts : cfg.tend;
    Ns = length(t)-1;       % number of samples / time instants at which control is applied
    x = zeros(DIMS.p, length(t)); x(:, 1) = x0;
    u = zeros(DIMS.q, length(t)); u(:, end) = NaN;
    r = zeros(1, length(t)); r(1) = NaN;
    % steps loop
    for k = 1:Ns,
        % compute mdegs of the current state
        mu = mdegs(x(1, k), XMFS{1});
        for p = 2:DIMS.p, mu = mu(:) * mdegs(x(p, k), XMFS{p})'; end;
        phi = find(mu); mu = mu(phi);
        % compute the optimal action
        if cfg.interph,     % either interpolated
            u(:, k) = (mu' * hstar(phi, :))';
        else                % or crisp (ties broken randomly)
            Qa = mu' * theta(phi, :);
            ui = find(Qa == max(Qa));
            ui = ui(ceil(rand * length(ui)));
            ui = lin2ndi(ui, DIMS.dimu);
            for q = 1:DIMS.q, u(q, k) = U{q}(ui(:, q)); end;
        end;
        % apply to the system
        [x(:, k+1), r(k+1)] = feval(model.fun, model, x(:, k), u(:, k));
    end;
    % compute the returns vector, assuming the reward remains constant up to infinity
    % discounting vector starting from t=0
    discounting = cfg.gamma .^ (0:Ns);      % 0:Ns-1 for the rewards; Ns for the infinity reward
    % discounted rewards from t=0; append also the infinite-horizon tail: r_inf * 1/(1-gamma)
    R = [r(2:Ns+1) r(end)/(1-cfg.gamma)] .* discounting;
    % [gamma^0*R_0 gamma^1*R_1 ... gamma^K*R_K gamma^(K+1)*Rinf] (reverse to do the cumulative sum)
    R = cumsum(R(end:-1:1));
    % divide R_k by gamma^k for each k, to find the return starting from k
    R = R(end:-1:1) ./ discounting;
    % plot history & optionally save figures
    hist.t = t; hist.x = x; hist.u = u; hist.r = r; hist.R = R;
    toscreen = strcmp(cfg.plottarget, 'screen') || isempty(cfg.plottarget);
    figh = [];              % stays empty when plotting is suppressed
    if ~cfg.noplot,
        figh = plothistory(hist);
        saveplot(figh, [cfg.savedir cfg.savefig], cfg.plottarget);
        % if the model supplies a plot function, use it to plot the trajectory
        if isfield(model, 'plotfun'),
            figh(end+1) = feval(model.plotfun, hist);
        end;
        % pause;
        % close the figures if their target was not the screen
        if ~toscreen, close(figh); end;
    end;
end;    % IF replay

% -----------------------------------------------
% Backup data
if cfg.run || cfg.resume,
    % save into the same directory as the problem
    savedir = fileparts(which(cfg.problem));
    % if the default file name is used, generate a unique file name
    if strcmp(cfg.datafile, CFG.datafile),
        c = fix(clock);
        for i = 1:length(c),
            cfg.datafile = [cfg.datafile '-' num2str(c(i), '%02d')];
        end;
        delete([CFG.datafile '.mat']);
    elseif ~strcmp(savedir, pwd),
        delete([cfg.datafile '.mat']);  % need to save in a different directory, so delete anyway
    end;    % otherwise just overwrite
    cfg.datafile = [savedir filesep cfg.datafile];
    save(cfg.datafile);
    dispx(['Fuzzy Q-iteration finished. Data was saved to [' cfg.datafile '].'], cfg.verb, 1);
end;

% -----------------------------------------------
% Set output
if cfg.run || cfg.resume,       % output the Q-iteration statistics
    varargout = {qistats};
elseif cfg.replay,              % output the history and possibly figure handles
    if toscreen,
        varargout = {hist, figh};
    else
        varargout = {hist, []};
    end;
end;

% END fuzzyqi() RETURNING varargout ===============================================================
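% Example usage (a sketch, not part of the original file; it assumes a
% hypothetical problem function 'mydoubleint' on the path that answers the
% 'model' and 'fuzzy' queries used above -- the grids and file names below are
% illustrative only):
%
%   cfg = struct();
%   cfg.run = 1;
%   cfg.problem = 'mydoubleint';          % hypothetical problem definition
%   cfg.datafile = 'dblint_fzqi';         % where to store the learning data
%   cfg.xgrids = {-1:.25:1, -2:.5:2};     % fuzzy cores, one cell per state variable
%   cfg.ugrids = {[-1 0 1]};              % discrete action grid
%   qistats = fuzzyqi(cfg);               % run fuzzy Q-iteration
%
%   % afterwards, replay the learned policy from a given initial state
%   rcfg = struct('replay', 1, 'datafile', 'dblint_fzqi', 'tend', 10);
%   rcfg.x0 = [0.5; 0];
%   hist = fuzzyqi(rcfg);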