📄 crf1m_learn_sgd.c
            /* Set label sequences and state scores. */
            crf1ml_set_labels(crf1mt, seq);
            crf1ml_state_score(crf1mt, seq, w, K, scale);
            crf1mc_exp_state(crf1mt->ctx);

            /* Compute forward/backward scores. */
            crf1mc_forward_score(crf1mt->ctx);
            crf1mc_backward_score(crf1mt->ctx);

            /* Compute the log-probability of the input sequence under the model. */
            logp += crf1mc_logprob(crf1mt->ctx);

            /* Update the feature weights. */
            crf1ml_enum_features(crf1mt, seq, update_feature_weights);

            /* Project the feature weights into the L2-ball. */
            boundary = sgdi->norm2 * scale * scale * lambda;
            if (1. < boundary) {
                proj = 1.0 / sqrt(boundary);
            }

            ++t;
        }

        /* Terminate when the log probability is abnormal (NaN, -Inf, +Inf). */
        if (!isfinite(logp)) {
            ret = CRFERR_OVERFLOW;
            break;
        }

        /* Include the L2 norm of the feature weights in the objective. */
        /* The factor N is necessary because lambda = 2 * C / N. */
        logp -= 0.5 * lambda * sgdi->norm2 * scale * scale * N;

        /* Prevent the scale factor from becoming too small. */
        if (scale < 1e-20) {
            for (i = 0; i < K; ++i) {
                w[i] *= scale;
            }
            decay = 1.;
            proj = 1.;
        }

        /* One epoch finished. */
        if (!calibration) {
            /* Check if the current epoch is the best so far. */
            if (best_logp < logp) {
                best_logp = logp;
                for (i = 0; i < K; ++i) {
                    best_w[i] = w[i];
                }
            }

            /* Don't test the stopping criterion until more than `period` epochs have run. */
            if (period < epoch) {
                improvement = (pf[(epoch-1) % period] - logp) / logp;
            } else {
                improvement = epsilon;
            }

            /* Store the current value of the objective function. */
            pf[(epoch-1) % period] = logp;

            logging(crf1mt->lg, "Log-likelihood: %f\n", logp);
            if (period < epoch) {
                logging(crf1mt->lg, "Improvement ratio: %f\n", improvement);
            }
            logging(crf1mt->lg, "Feature L2-norm: %f\n", sqrt(sgdi->norm2) * scale);
            logging(crf1mt->lg, "Learning rate (eta): %f\n", eta);
            logging(crf1mt->lg, "Total number of feature updates: %.0f\n", t);
            logging(crf1mt->lg, "Seconds required for this iteration: %.3f\n",
                (clock() - clk_prev) / (double)CLOCKS_PER_SEC);

            /* Send the tagger with the current parameters. */
            if (crf1mt->cbe_proc != NULL) {
                /* Callback notification with the tagger object. */
                int ret = crf1mt->cbe_proc(crf1mt->cbe_instance, &crf1mt->tagger);
            }
            logging(crf1mt->lg, "\n");

            /* Check the stopping criterion. */
            if (improvement < epsilon) {
                break;
            }
        }
    }

    /* Restore the best weights. */
    if (best_w != NULL) {
        logp = best_logp;
        for (i = 0; i < K; ++i) {
            w[i] = best_w[i];
        }
    }

    free(best_w);
    free(pf);

    if (ptr_logp != NULL) {
        *ptr_logp = logp;
    }
    return ret;
}

static floatval_t l2sgd_calibration(
    crf1ml_t* crf1mt,
    const crf1ml_sgd_option_t* opt
    )
{
    int *perm = NULL;
    int dec = 0, ok, trials = 1;
    int num_candidates = opt->calibration_candidates;
    clock_t clk_begin = clock();
    floatval_t logp;
    floatval_t init_logp = 0.;
    floatval_t best_logp = -DBL_MAX;
    floatval_t eta = opt->calibration_eta;
    floatval_t best_eta = opt->calibration_eta;
    floatval_t *w = crf1mt->w;
    const int N = crf1mt->num_sequences;
    const int M = MIN(N, opt->calibration_samples);
    const floatval_t init_eta = opt->calibration_eta;
    const floatval_t rate = opt->calibration_rate;
    const floatval_t lambda = opt->lambda;

    logging(crf1mt->lg, "Calibrating the learning rate (eta)\n");
    logging(crf1mt->lg, "sgd.calibration.eta: %f\n", eta);
    logging(crf1mt->lg, "sgd.calibration.rate: %f\n", rate);
    logging(crf1mt->lg, "sgd.calibration.samples: %d\n", M);
    logging(crf1mt->lg, "sgd.calibration.candidates: %d\n", num_candidates);

    /* Initialize a permutation that shuffles the instances. */
    perm = (int*)malloc(sizeof(int) * N);
    crf1ml_shuffle(perm, N, 1);

    /* Initialize the feature weights to zero. */
    initialize_weights(crf1mt);

    /* Compute the initial log-likelihood. */
    init_logp = compute_loglikelihood(crf1mt, perm, M, lambda);
    logging(crf1mt->lg, "Initial Log-likelihood: %f\n", init_logp);

    while (num_candidates > 0 || !dec) {
        logging(crf1mt->lg, "Trial #%d (eta = %f): ", trials, eta);

        /* Initialize the feature weights to zero. */
        initialize_weights(crf1mt);

        /* Perform SGD for one epoch. */
        l2sgd(crf1mt, perm, M, 1.0 / (lambda * eta), lambda, 1, 1, 1, 0., &logp);

        /* Make sure that this learning rate improves (increases) the log-likelihood. */
        ok = isfinite(logp) && (init_logp < logp);
        if (ok) {
            logging(crf1mt->lg, "%f\n", logp);
        } else {
            logging(crf1mt->lg, "%f (worse)\n", logp);
        }

        if (ok) {
            --num_candidates;
            if (best_logp < logp) {
                best_logp = logp;
                best_eta = eta;
            }
        }

        if (!dec) {
            if (ok) {
                eta *= rate;
            } else {
                dec = 1;
                eta = init_eta / rate;
            }
        } else {
            eta /= rate;
        }

        ++trials;
    }

    eta = best_eta;
    logging(crf1mt->lg, "Best learning rate (eta): %f\n", eta);
    logging(crf1mt->lg, "Seconds required: %.3f\n",
        (clock() - clk_begin) / (double)CLOCKS_PER_SEC);
    logging(crf1mt->lg, "\n");

    free(perm);

    return 1.0 / (lambda * eta);
}

int crf1ml_sgd_options(crf_params_t* params, crf1ml_option_t* opt, int mode)
{
    crf1ml_sgd_option_t* sgd = &opt->sgd;

    BEGIN_PARAM_MAP(params, mode)
        DDX_PARAM_FLOAT("regularization.sigma", sgd->sigma, 1., "")
        DDX_PARAM_INT("sgd.max_iterations", sgd->max_iterations, 1000, "")
        DDX_PARAM_INT("sgd.period", sgd->period, 10, "")
        DDX_PARAM_FLOAT("sgd.delta", sgd->delta, 1e-6, "")
        DDX_PARAM_FLOAT("sgd.calibration.eta", sgd->calibration_eta, 0.1, "")
        DDX_PARAM_FLOAT("sgd.calibration.rate", sgd->calibration_rate, 2., "")
        DDX_PARAM_INT("sgd.calibration.samples", sgd->calibration_samples, 1000, "")
        DDX_PARAM_INT("sgd.calibration.candidates", sgd->calibration_candidates, 10, "")
    END_PARAM_MAP()

    return 0;
}

int crf1ml_sgd(
    crf1ml_t* crf1mt,
    crf1ml_option_t *opt
    )
{
    int ret = 0;
    int *perm = NULL;
    clock_t clk_begin;
    floatval_t logp = 0;
    const int N = crf1mt->num_sequences;
    const int K = crf1mt->num_features;
    crf1ml_sgd_option_t* sgdopt = &opt->sgd;
    sgd_internal_t sgd_internal;

    /* Set the solver-specific information. */
    crf1mt->solver_data = &sgd_internal;

    sgdopt->lambda = 1.0 / (sgdopt->sigma * sgdopt->sigma * N);

    logging(crf1mt->lg, "Stochastic Gradient Descent (SGD)\n");
    logging(crf1mt->lg, "regularization.sigma: %f\n", sgdopt->sigma);
    logging(crf1mt->lg, "sgd.max_iterations: %d\n", sgdopt->max_iterations);
    logging(crf1mt->lg, "sgd.period: %d\n", sgdopt->period);
    logging(crf1mt->lg, "sgd.delta: %f\n", sgdopt->delta);
    logging(crf1mt->lg, "\n");
    clk_begin = clock();

    /* Calibrate the learning rate (eta). */
    sgdopt->t0 = l2sgd_calibration(crf1mt, sgdopt);

    /* Initialize a permutation that shuffles the instances. */
    perm = (int*)malloc(sizeof(int) * N);
    crf1ml_shuffle(perm, N, 1);

    /* Initialize the feature weights to zero. */
    initialize_weights(crf1mt);

    /* Perform stochastic gradient descent. */
    ret = l2sgd(
        crf1mt, perm, N,
        sgdopt->t0, sgdopt->lambda,
        sgdopt->max_iterations, 0, sgdopt->period, sgdopt->delta,
        &logp
        );

    logging(crf1mt->lg, "Log-likelihood: %f\n", logp);
    logging(crf1mt->lg, "Total seconds required for SGD: %.3f\n",
        (clock() - clk_begin) / (double)CLOCKS_PER_SEC);
    logging(crf1mt->lg, "\n");

    free(perm);

    return ret;
}
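
/*
 * Note on the boundary/proj computation in l2sgd above: it appears to follow the
 * Pegasos-style trick of projecting the weight vector onto the L2-ball of radius
 * 1/sqrt(lambda) whenever it drifts outside, using the squared norm tracked in
 * sgdi->norm2 and the lazy scaling factor `scale`. The standalone sketch below
 * shows just that step; the function name l2_projection_factor and its parameters
 * are hypothetical and introduced only for illustration, not part of this file.
 */
#include <math.h>

static double l2_projection_factor(double norm2, double scale, double lambda)
{
    /* True squared norm of the weights is norm2 * scale * scale. */
    double boundary = norm2 * scale * scale * lambda;

    /* Shrink the weights only when they lie outside the ball of radius 1/sqrt(lambda). */
    return (1. < boundary) ? (1.0 / sqrt(boundary)) : 1.0;
}
/*
 * The returned factor would be folded into the running scale (as `proj` is here),
 * so the weights never need to be rescaled element by element on every update.
 */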