📄 softmax.m
if ~isempty(mask)
    if ~isa(mask, 'double') | ~isreal(mask) | ndims(mask) ~= 2 | ...
            ~any(any(mask)) | ~all(nonzeros(mask) == 1)
        error('Mask must be a 2-d array of 0s and 1s.')
    end
    if isempty(weights)
        weights = mask;
        weights(find(mask)) = 1.4 * rand(nnz(mask), 1) - .7;
        normw = 0;
    elseif ndims(weights) ~= 2 | ~all(size(mask) == size(weights))
        error('MASK and WEIGHTS must be the same size.')
    end
end
if normw
    if ~isa(weights, 'double') | ~isreal(weights) | ...
            ndims(weights) ~= 2 | any(any(isnan(weights)))
        error('Weights must be a 2-d real array.')
    end
    if isempty(mask)
        mask = weights;
        mask(find(weights)) = 1;
    end
    % rescale weights because of input normalisation
    weights(1, p+2:end) = weights(1, p+2:end) + ...
        range(1,:) * weights(2:p+1, p+2:end);
    weights(2:p+1, p+2:end) = diag(diff(range)) ...
        * weights(2:p+1, p+2:end);
end
if decay
    % insert a redundant output unit to balance weights for the decay
    % parameter
    m = sum(weights(:, end-g+2:end), 2)/g;
    weights = [weights(:, 1:end-g+1), -m, ...
        [weights(:, end-g+2:end) - repmat(m, 1, g-1)]];
    mask = [mask(:, 1:end-g+1), ...
        sparse(any(mask(:, end-g+2:end), 2)), ...
        mask(:, end-g+2:end)];
    % the weights from each unit to all outputs now sum to 0
end
weights(max(find(any(weights | mask, 2))) + 1:end, :) = [];
mask(size(weights, 1)+1:end, :) = [];
else % no MASK or WEIGHTS given: build the network from NHID
    if ~isempty(nhid)
        if ~isa(nhid, 'double') | length(nhid) ~= prod(size(nhid)) ...
                | ~isreal(nhid) | any(round(nhid) ~= nhid | ...
                isinf(nhid) | nhid < 0)
            error('NUNITS must be a vector of non-negative, finite integers.')
        elseif length(nhid) > 1
            if any(nhid <= 0)
                error(['Cannot specify layers with no units with more than' ...
                       ' one hidden layer.'])
            end
        elseif nhid == 0
            nhid = [];
        end
    end
    if isempty(skip)
        skip = 0;
    elseif skip
        if length(nhid) > 1
            error(['May not specify skip weights with more than one hidden' ...
                   ' layer.'])
        elseif isempty(nhid)
            error('May not specify skip weights with no hidden layer.')
        end
        skip = 1;
    else
        skip = 0;
    end
    noutput = g - (decay == 0);
    nunits = [p nhid noutput];
    nweights = sum((nunits(1:end-1) + 1) .* nunits(2:end)) + ...
        skip*p*noutput;
    idx = cumsum([2 nunits]);
    nlayer = length(nunits);
    nunits = sum(nunits)+1;
    % sparse(i, j, s) with scalar arguments allocates an
    % (nunits-noutput)-by-nunits matrix; its single placeholder entry is
    % overwritten when the weights are initialised below.
    weights = sparse(nunits - noutput, nunits, nweights);
    mask = weights;
    % connect the bias to all hidden and output units.
    mask(1, p+2:end) = 1;
    if skip
        % connect input units directly to all output units.
        mask(2:p+1, end-noutput+1:end) = 1;
    end
    for i = 1:nlayer-1
        % connect adjacent layers
        mask(idx(i):idx(i+1)-1, idx(i+1):idx(i+2)-1) = 1;
    end
    weights(find(mask)) = 1.4 * rand(nweights, 1) - .7;
end

ninput = max(find(~any(mask | weights)));
if any(~any(mask(:,p+2:end) | weights(:,p+2:end)))
    error(sprintf(['Unit %d has no input:\nInput units must have lowest' ...
        ' indices.'], ...
        min(find(~any(mask(:,p+2:end) | ...
        weights(:,p+2:end))))+p+1))
elseif ninput < p+1
    error('Not enough input units.')
elseif ninput > p+1
    error('Too many input units.')
end
noutput = diff(size(mask));
if any(~any(mask | weights, 2))
    error(sprintf(['Unit %d has no output:\nOutput units must have' ...
        ' highest indices.'], ...
        min(find(~any(mask | weights, 2)))))
elseif noutput < g-(decay == 0)
    error('Not enough output units.')
elseif noutput > g-(decay == 0)
    error('Too many output units.')
end
if any(any(tril(weights | mask)))
    error('All weights must join a lower to a higher indexed unit.')
end
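% Note on the representation: the network is stored as a single sparse
% connectivity matrix.  Entry (i, j) of WEIGHTS is the weight on the
% connection from unit i to unit j, and MASK marks which entries are
% trainable.  Units are ordered bias, inputs, hidden layers, outputs,
% so every connection must run from a lower to a higher index (the
% matrix is strictly upper triangular, as enforced above).  As an
% illustrative sketch (not part of the fitting code), a network with
% p = 2 inputs, one hidden layer of 2 units and a single output unit
% would use a 5-by-6 matrix over units [bias x1 x2 h1 h2 out]:
%
%   mask = sparse(5, 6);
%   mask(1, 4:6) = 1;   % bias -> h1, h2, out
%   mask(2:3, 4:5) = 1; % x1, x2 -> h1, h2
%   mask(4:5, 6) = 1;   % h1, h2 -> out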
if isempty(maxit)
    maxit = 200;
elseif ~isa(maxit, 'double') | ~isreal(maxit) | length(maxit) ~= 1 ...
        | round(maxit) ~= maxit | maxit < 0
    error('MAXITER must be a non-negative integer.')
end

% initial states
[E post grad] = feedprop(X, G, w, weights, mask, decay);
H = eye(nnz(mask)); % approximation to the inverse of the Hessian
oldweights = weights;
oldE = E;

for iter = 1:maxit
    dir = -H*grad;   % quasi-Newton search direction
    Ep = grad'*dir;  % directional derivative of the error along dir
    lambda = [1 0]'; % current and previous step lengths
    lambdamin = 2*eps*max(abs(dir)./max(abs(oldweights(find(mask))), 1));
    while 1
        if lambda(1) < lambdamin
            weights = oldweights;
            break
        end
        % try a point along dir
        weights(find(mask)) = oldweights(find(mask)) + lambda(1)*dir;
        E = feedprop(X, G, w, weights, mask, decay);
        if E <= oldE + 1e-4*Ep*lambda(1)
            break % sufficient decrease: good enough
        elseif lambda(1) == 1
            % first backtrack: minimise a quadratic model of E(lambda)
            lambda = [-Ep/(2*(E - oldE - Ep)); 1];
        else
            % later backtracks: minimise a cubic model of E(lambda)
            ab = [1 -1; -lambda(2) lambda(1)] * diag(1./lambda.^2) * ...
                ([E; E2] - Ep*lambda - oldE) / diff(lambda);
            lambda(2) = lambda(1);
            if ab(1) == 0
                if ab(2) == 0
                    break
                end
                lambda(1) = -Ep/(2*ab(2));
            else
                lambda(1) = (-ab(2) + sqrt(ab(2)^2 - 3*ab(1)*Ep)) / ...
                    (3*ab(1));
            end
        end
        if ~isreal(lambda)
            lambda(1) = .1*lambda(2);
        else
            lambda(1) = max(min(lambda(1), .5*lambda(2)), .1*lambda(2));
        end
        E2 = E;
    end
    if trace & ~rem(iter, 10)
        disp(sprintf('Iter: %d; Err: %g', iter, E))
    end
    if oldE - E < 0
        % indicates divergence (this appears to be normal for large lambda)
        warning('Error diverged.')
        weights = oldweights;
        E = oldE;
        break
    elseif oldE - E < E*n*eps
        % indicates convergence
        if trace
            disp('Error converged.')
        end
        break
    end
    grad1 = grad;
    [oldE post grad] = feedprop(X, G, w, weights, mask, decay);
    dir = weights(find(mask)) - oldweights(find(mask));
    if max(dir./max(weights(find(mask)), 1)) < 4*eps
        if trace
            % convergence in weights but not error (probably too strict)
            disp('Gradient converged.')
        end
        break
    end
    oldweights = weights;
    % BFGS update of the inverse Hessian approximation
    dg = grad - grad1;
    pdg = dir'*dg;
    Hdg = H*dg;
    gHg = dg'*Hdg;
    u = dir/pdg - Hdg/gHg;
    H = H + dir*dir'/pdg - Hdg*Hdg'/gHg + gHg*u*u';
end

if decay
    % get rid of the redundant output unit by subtracting its weights
    % from the other output weights
    weights = [weights(:, 1:end-g), [weights(:, end-g+2:end) - ...
        repmat(weights(:, end-g+1), 1, g-1)]];
end

% rescale to actual non-normalised inputs so no one gets confused.
weights(2:p+1, :) = spdiags(1./diff(range)', 0, p, p) ...
    * weights(2:p+1, :);
weights(1, p+2:end) = weights(1, p+2:end) - ...
    range(1,:) * weights(2:p+1, p+2:end);

f = class(struct('weights', weights), 'softmax', h);

if nargout > 2
    % deviance: twice the error with the decay penalty removed
    dev = 2*(E - decay*sum(weights(find(mask)).^2));
    if nargout > 3
        hess = inv(H);
    end
end
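% The loop above is a quasi-Newton (BFGS) minimiser: H approximates the
% inverse Hessian of the error and is updated each iteration from the
% weight step s = dir and gradient change y = dg as
%
%   u = s/(s'*y) - (H*y)/(y'*H*y)
%   H = H + (s*s')/(s'*y) - (H*y)*(H*y)'/(y'*H*y) + (y'*H*y)*(u*u')
%
% The inner while-loop is a backtracking line search: it accepts a step
% once E <= oldE + 1e-4*Ep*lambda (sufficient decrease), otherwise it
% shrinks lambda by minimising first a quadratic, then a cubic, model of
% the error along the search direction.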
function [E, post, grad] = feedprop(X, G, w, weights, mask, decay)
%FEEDPROP Feed-forward and back-propagate.
%   [E, POST, GRAD] = FEEDPROP(X, G, W, WEIGHTS, MASK, DECAY)
%   returns the error E, the posterior probabilities in the n by g
%   matrix POST and the gradient vector of partial derivatives in
%   the NNZ(MASK) length vector GRAD.

[n p] = size(X);
g = size(G, 2);
logG = G; % so we don't run into NaNs
logG(find(G)) = log(G(find(G)));
ninput = p + 1;
noutput = g - (decay == 0);
nunits = size(weights, 2);
nhidden = nunits - noutput - ninput;

% outputs from individual units, including the network inputs
y = [ones(n, 1), X, zeros(n, nhidden + noutput)];
for i = ninput+1:nunits % hidden and output units
    [idx, j, wgt] = find(weights(:, i));
    nwgt = length(idx);
    if nwgt % a unit can have no incoming weights: it has happened!
        y(:, i) = sum(y(:, idx) * diag(wgt), 2);
        if i <= nunits - noutput
            % hidden units are logistic
            out = exp(y(:,i));
            y(:, i) = out./(1+out);
            y(isnan(y(:,i)), i) = 1; % if out is very large
        end
    end
end

% calculate normalised posterior probabilities using the SOFTMAX criterion;
% if there is no decay, the first class is always zero
post = [zeros(n, decay == 0), y(:, end-noutput+1:end)];
post = exp(post - repmat(max(post(:, 2:end), [], 2), 1, g));
post = post ./ repmat(sum(post, 2), 1, g);
if any(any(~post & G))
    E = inf;
else
    logpost = G; % zeros in G are not a problem; zeros in post mean
                 % that weights are going off to infinity.
    logpost(find(G)) = log(post(find(G)));
    E = sum(w' * (G .* (logG - logpost) - G + post)) + ...
        decay*sum(weights(find(mask)).^2);
end
if nargout > 1
    % back-propagate: delta holds dE/dnet for hidden and output units
    delta = [zeros(n, nhidden), ...
        post(:, 1 + (decay == 0):end) - G(:, 1 + (decay == 0):end)];
    for i = nhidden:-1:1
        [j idx wgt] = find(weights(i+ninput, :));
        nwgt = length(idx);
        delta(:, i) = y(:, i+ninput) .* (1 - y(:, i+ninput)) .* ...
            sum(delta(:, idx-ninput) * spdiags(wgt', 0, nwgt, nwgt), 2);
    end
    [i j] = find(mask);
    grad = (y(:, i) .* delta(:, j-ninput))' * w ...
        + 2*decay*weights(find(mask));
end
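The posterior computation in FEEDPROP subtracts the per-row maximum of the output activations before exponentiating; softmax is invariant to adding a constant to every activation in a row, so this changes nothing mathematically but prevents overflow in exp. A minimal standalone sketch of the same trick (variable names here are illustrative, not part of softmax.m):

    net = [2 .5 -1; .3 .3 .3];  % hypothetical n-by-g output activations
    shifted = net - repmat(max(net, [], 2), 1, size(net, 2));
    post = exp(shifted);
    post = post ./ repmat(sum(post, 2), 1, size(net, 2)); % rows sum to 1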