crf1m_learn_sgd.c
/*
 *    Training linear-chain CRFs with Stochastic Gradient Descent (SGD).
 *
 * Copyright (c) 2007-2009, Naoaki Okazaki
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the names of the authors nor the names of its contributors
 *       may be used to endorse or promote products derived from this
 *       software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* $Id: crf1m_learn_sgd.c 159 2009-03-17 01:50:30Z naoaki $ */

/*
    SGD for L2-regularized MAP estimation.

    The iterative algorithm is inspired by Pegasos:

    Shai Shalev-Shwartz, Yoram Singer, and Nathan Srebro.
    Pegasos: Primal Estimated sub-GrAdient SOlver for SVM.
    In Proc. of ICML 2007, pp. 807-814, 2007.

    The calibration strategy is inspired by the implementation of sgd:
    http://leon.bottou.org/projects/sgd
    written by Léon Bottou.

    The objective function to minimize is the L2 penalty plus the averaged
    negative log-likelihood of the training data:

        f(w) = (lambda/2) * ||w||^2 - (1/N) * \sum_i^N log P^i(y|x)
        lambda = 2 * C / N

    The original version of the Pegasos algorithm (oexp and mexp denote the
    observation and model expectations of the feature vector):

    0) Initialization.
        t = t0
        k = [the batch size]
    1) Computing the learning rate (eta).
        eta = 1 / (lambda * t)
    2) Updating feature weights.
        w = (1 - eta * lambda) w + (eta / k) \sum_i (oexp - mexp)
    3) Projecting feature weights within an L2-ball.
        w = min{1, (1/sqrt(lambda))/||w||} * w
    4) Goto 1 until convergence.

    A naive implementation requires O(K) computations for steps 2 and 3,
    where K is the total number of features; a sketch of one such naive
    step follows.
*/
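/*
 * Illustrative sketch (not part of the original source): one naive step of
 * the algorithm above, on plain double arrays. The function name and the
 * argument g (the averaged gradient (1/k) * \sum_i (mexp - oexp)) are
 * hypothetical. Steps 2 and 3 each touch all K weights, which is the O(K)
 * cost that the lazy scheme below avoids.
 */
#include <math.h>

static void naive_pegasos_step(double *w, const double *g, int K,
                               double lambda, double t)
{
    int i;
    const double eta = 1.0 / (lambda * t);    /* 1) learning rate */
    double norm2 = 0.;

    /* 2) Decay all weights and take a gradient step; O(K). */
    for (i = 0;i < K;++i) {
        w[i] = (1.0 - eta * lambda) * w[i] - eta * g[i];
        norm2 += w[i] * w[i];
    }

    /* 3) Project onto the L2-ball of radius 1/sqrt(lambda); O(K). */
    if (1.0 / lambda < norm2) {
        const double proj = 1.0 / sqrt(lambda * norm2);
        for (i = 0;i < K;++i) {
            w[i] *= proj;
        }
    }
}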
/*
    This code implements the procedure in an efficient way.

    0) Initialization.
        norm2 = 0
        decay = 1
        proj = 1
    1) Computing various factors.
        eta = 1 / (lambda * t)
        decay *= (1 - eta * lambda)
        scale = decay * proj
        gain = (eta / k) / scale
    2) Updating feature weights.
       Updating a feature weight from the observation expectation:
        delta = gain * (1.0) * f(x,y)
        norm2 += delta * (delta + w + w)
        w += delta
       Updating a feature weight from the model expectation:
        delta = gain * (-P(y|x)) * f(x,y)
        norm2 += delta * (delta + w + w)
        w += delta
    3) Projecting feature weights within an L2-ball.
        if 1.0 / lambda < norm2 * scale * scale:
            proj = 1.0 / (sqrt(norm2 * lambda) * scale)
    4) Goto 1 until convergence.

    Sketches of this bookkeeping appear after compute_loglikelihood() and at
    the end of the listing.
*/

#ifdef  HAVE_CONFIG_H
#include <config.h>
#endif/*HAVE_CONFIG_H*/

#include <os.h>

#include <float.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>

#include <crf.h>
#include "logging.h"
#include "params.h"
#include "crf1m.h"

#define MIN(a, b)   ((a) < (b) ? (a) : (b))

typedef struct {
    /** The squared L2 norm of the feature weights. */
    floatval_t  norm2;
    /** Scaling factor for updating weights. */
    floatval_t  gain;
} sgd_internal_t;

#define SGD_INTERNAL(crf1ml)    ((sgd_internal_t*)((crf1ml)->solver_data))

inline void initialize_weights(crf1ml_t* crf1ml)
{
    int i;
    floatval_t *w = crf1ml->w;
    sgd_internal_t *sgdi = SGD_INTERNAL(crf1ml);

    for (i = 0;i < crf1ml->num_features;++i) {
        w[i] = 0.;
    }
    sgdi->norm2 = 0;
}

inline void update_weight(sgd_internal_t* sgdi, floatval_t* w, const int fid, floatval_t a)
{
    floatval_t w0 = w[fid];
    w[fid] += a;
    /* Maintain ||w||^2 incrementally: (w0 + a)^2 - w0^2 = a * (a + 2*w0). */
    sgdi->norm2 += a * (a + w0 + w0);
}

inline static void update_feature_weights(
    crf1ml_feature_t* f,
    const int fid,
    floatval_t prob,
    floatval_t scale,
    crf1ml_t* trainer,
    const crf_sequence_t* seq,
    int t
    )
{
    floatval_t *w = trainer->w;
    sgd_internal_t* sgdi = SGD_INTERNAL(trainer);

    /* Subtract the model expectation from the weight. */
    update_weight(sgdi, w, fid, -sgdi->gain * prob * scale);

    /* Add the observation expectation to the weight when the feature
       fires on the reference labeling. */
    switch (f->type) {
    case FT_STATE:      /**< State features. */
        if (f->dst == seq->items[t].label) {
            update_weight(sgdi, w, fid, sgdi->gain * scale);
        }
        break;
    case FT_TRANS:      /**< Transition features. */
        if (f->src == seq->items[t].label &&
            f->dst == seq->items[t+1].label) {
            update_weight(sgdi, w, fid, sgdi->gain * scale);
        }
        break;
    case FT_TRANS_BOS:  /**< BOS transition features. */
        if (f->dst == seq->items[t].label) {
            update_weight(sgdi, w, fid, sgdi->gain * scale);
        }
        break;
    case FT_TRANS_EOS:  /**< EOS transition features. */
        if (f->src == seq->items[t].label) {
            update_weight(sgdi, w, fid, sgdi->gain * scale);
        }
        break;
    }
}

static floatval_t compute_loglikelihood(
    crf1ml_t* crf1mt,
    int *perm,
    const int N,
    floatval_t lambda
    )
{
    int i;
    crf_sequence_t *seq;
    floatval_t logp = 0., norm = 0.;
    const floatval_t* w = crf1mt->w;
    const int K = crf1mt->num_features;

    /* Set transition scores. */
    crf1ml_transition_score(crf1mt, w, K, 1.);
    crf1mc_exp_transition(crf1mt->ctx);

    for (i = 0;i < N;++i) {
        seq = &crf1mt->seqs[perm[i]];

        /* Set label sequences and state scores. */
        crf1ml_set_labels(crf1mt, seq);
        crf1ml_state_score(crf1mt, seq, w, K, 1.);
        crf1mc_exp_state(crf1mt->ctx);

        /* Compute forward/backward scores. */
        crf1mc_forward_score(crf1mt->ctx);
        crf1mc_backward_score(crf1mt->ctx);

        /* Accumulate the log-probability of the reference labeling under
           the model. */
        logp += crf1mc_logprob(crf1mt->ctx);
    }

    /* Compute the L2 norm of feature weights. */
    for (i = 0;i < K;++i) {
        norm += w[i] * w[i];
    }

    return logp - 0.5 * lambda * norm * N;
}
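/*
 * Illustrative sketch (not part of the original source): the bookkeeping
 * behind update_weight() above and l2sgd() below, on plain double arrays.
 * The actual weights are represented as scale * v and are never rescaled in
 * bulk, so the (1 - eta*lambda) decay of step 2 costs O(1): it only
 * multiplies decay (and hence scale). A sparse update of delta to a single
 * stored weight keeps ||v||^2 current through the identity
 * (v0 + delta)^2 - v0^2 = delta * (delta + 2*v0). The type and function
 * names here are hypothetical.
 */
typedef struct {
    double *v;      /* stored weights; the actual weights are scale * v */
    double  norm2;  /* running value of ||v||^2 */
    double  scale;  /* decay * proj */
} lazy_weights_t;

static void lazy_update(lazy_weights_t *lw, int fid, double delta)
{
    const double v0 = lw->v[fid];
    lw->v[fid] += delta;
    lw->norm2 += delta * (delta + v0 + v0);
}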
static int l2sgd(
    crf1ml_t* crf1mt,
    int *perm,
    const int N,
    const floatval_t t0,
    const floatval_t lambda,
    const int num_epochs,
    int calibration,
    int period,
    const floatval_t epsilon,
    floatval_t *ptr_logp
    )
{
    int i, epoch, ret = 0;
    floatval_t t = 0;
    floatval_t logp = 0;
    floatval_t best_logp = -DBL_MAX;
    clock_t clk_prev;
    crf_sequence_t *seq, *seqs = crf1mt->seqs;
    floatval_t* w = crf1mt->w;
    floatval_t* best_w = NULL;
    const int K = crf1mt->num_features;
    floatval_t eta, scale, boundary;
    floatval_t decay = 1., proj = 1.;
    sgd_internal_t* sgdi = SGD_INTERNAL(crf1mt);
    floatval_t improvement = 0.;
    floatval_t *pf = NULL;

    if (!calibration) {
        pf = (floatval_t*)malloc(sizeof(floatval_t) * period);
        best_w = (floatval_t*)malloc(sizeof(floatval_t) * K);
        for (i = 0;i < K;++i) {
            best_w[i] = 0.;
        }
    }

    /* Loop for epochs. */
    for (epoch = 1;epoch <= num_epochs;++epoch) {
        if (!calibration) {
            logging(crf1mt->lg, "***** Epoch #%d *****\n", epoch);
        }
        clk_prev = clock();

        if (!calibration) {
            /* Generate a permutation that shuffles the instances. */
            crf1ml_shuffle(perm, N, 0);
        }

        /* Loop for instances. */
        logp = 0.;
        for (i = 0;i < N;++i) {
            seq = &seqs[perm[i]];

            /* Update various factors. */
            eta = 1 / (lambda * (t0 + t));
            decay *= (1.0 - eta * lambda);
            scale = decay * proj;
            sgdi->gain = eta / scale;

            /* Set transition scores. */
            crf1ml_transition_score(crf1mt, w, K, scale);
            crf1mc_exp_transition(crf1mt->ctx);
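/*
 * Illustrative sketch (not part of the original source): step 3 of the
 * efficient procedure, the L2-ball projection, in the scaled representation
 * used above. Since the actual squared norm is norm2 * scale * scale, the
 * projection min{1, (1/sqrt(lambda))/||w||} reduces to an O(1) adjustment
 * of the proj factor rather than a pass over all K weights. The function
 * name is hypothetical; the returned factor is folded into scale on the
 * next iteration.
 */
#include <math.h>

static double project_onto_l2ball(double norm2, double scale, double lambda)
{
    if (1.0 / lambda < norm2 * scale * scale) {
        /* Shrink so the actual norm becomes exactly 1/sqrt(lambda). */
        return 1.0 / (sqrt(norm2 * lambda) * scale);
    }
    return 1.0;    /* already inside the ball; no change */
}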