📄 crf1m_learn_sgd.c
            /* Set label sequences and state scores. */
            crf1ml_set_labels(crf1mt, seq);
            crf1ml_state_score(crf1mt, seq, w, K, scale);
            crf1mc_exp_state(crf1mt->ctx);

            /* Compute forward/backward scores. */
            crf1mc_forward_score(crf1mt->ctx);
            crf1mc_backward_score(crf1mt->ctx);

            /* Compute the log-probability of the input sequence under the model. */
            logp += crf1mc_logprob(crf1mt->ctx);

            /* Update the feature weights. */
            crf1ml_enum_features(crf1mt, seq, update_feature_weights);

            /* Project the feature weights into the L2-ball. */
            boundary = sgdi->norm2 * scale * scale * lambda;
            if (1. < boundary) {
                proj = 1.0 / sqrt(boundary);
            }

            ++t;
        }

        /* Terminate when the log probability is abnormal (NaN, -Inf, +Inf). */
        if (!isfinite(logp)) {
            ret = CRFERR_OVERFLOW;
            break;
        }

        /* Include the L2 norm of the feature weights in the objective. */
        /* The factor N is necessary because lambda = 2 * C / N. */
        logp -= 0.5 * lambda * sgdi->norm2 * scale * scale * N;

        /* Prevent the scale factor from becoming too small. */
        if (scale < 1e-20) {
            for (i = 0; i < K; ++i) {
                w[i] *= scale;
            }
            decay = 1.;
            proj = 1.;
        }

        /* One epoch finished. */
        if (!calibration) {
            /* Check if the current epoch is the best so far. */
            if (best_logp < logp) {
                best_logp = logp;
                for (i = 0; i < K; ++i) {
                    best_w[i] = w[i];
                }
            }

            /* Don't test the stopping criterion until more than `period` epochs have run. */
            if (period < epoch) {
                improvement = (pf[(epoch-1) % period] - logp) / logp;
            } else {
                improvement = epsilon;
            }

            /* Store the current value of the objective function. */
            pf[(epoch-1) % period] = logp;

            logging(crf1mt->lg, "Log-likelihood: %f\n", logp);
            if (period < epoch) {
                logging(crf1mt->lg, "Improvement ratio: %f\n", improvement);
            }
            logging(crf1mt->lg, "Feature L2-norm: %f\n", sqrt(sgdi->norm2) * scale);
            logging(crf1mt->lg, "Learning rate (eta): %f\n", eta);
            logging(crf1mt->lg, "Total number of feature updates: %.0f\n", t);
            logging(crf1mt->lg, "Seconds required for this iteration: %.3f\n",
                (clock() - clk_prev) / (double)CLOCKS_PER_SEC);

            /* Send the tagger with the current parameters. */
            if (crf1mt->cbe_proc != NULL) {
                /* Callback notification with the tagger object. */
                int ret = crf1mt->cbe_proc(crf1mt->cbe_instance, &crf1mt->tagger);
            }
            logging(crf1mt->lg, "\n");

            /* Check the stopping criterion. */
            if (improvement < epsilon) {
                break;
            }
        }
    }

    /* Restore the best weights. */
    if (best_w != NULL) {
        logp = best_logp;
        for (i = 0; i < K; ++i) {
            w[i] = best_w[i];
        }
    }

    free(best_w);
    free(pf);

    if (ptr_logp != NULL) {
        *ptr_logp = logp;
    }
    return ret;
}

static floatval_t l2sgd_calibration(
    crf1ml_t* crf1mt,
    const crf1ml_sgd_option_t* opt
    )
{
    int *perm = NULL;
    int dec = 0, ok, trials = 1;
    int num_candidates = opt->calibration_candidates;
    clock_t clk_begin = clock();
    floatval_t logp;
    floatval_t init_logp = 0.;
    floatval_t best_logp = -DBL_MAX;
    floatval_t eta = opt->calibration_eta;
    floatval_t best_eta = opt->calibration_eta;
    floatval_t *w = crf1mt->w;
    const int N = crf1mt->num_sequences;
    const int M = MIN(N, opt->calibration_samples);
    const floatval_t init_eta = opt->calibration_eta;
    const floatval_t rate = opt->calibration_rate;
    const floatval_t lambda = opt->lambda;

    logging(crf1mt->lg, "Calibrating the learning rate (eta)\n");
    logging(crf1mt->lg, "sgd.calibration.eta: %f\n", eta);
    logging(crf1mt->lg, "sgd.calibration.rate: %f\n", rate);
    logging(crf1mt->lg, "sgd.calibration.samples: %d\n", M);
    logging(crf1mt->lg, "sgd.calibration.candidates: %d\n", num_candidates);

    /* Initialize a permutation that shuffles the instances. */
    perm = (int*)malloc(sizeof(int) * N);
    crf1ml_shuffle(perm, N, 1);

    /* Initialize the feature weights to zero. */
    initialize_weights(crf1mt);

    /* Compute the initial log-likelihood. */
    init_logp = compute_loglikelihood(crf1mt, perm, M, lambda);
    logging(crf1mt->lg, "Initial Log-likelihood: %f\n", init_logp);

    while (num_candidates > 0 || !dec) {
        logging(crf1mt->lg, "Trial #%d (eta = %f): ", trials, eta);

        /* Initialize the feature weights to zero. */
        initialize_weights(crf1mt);

        /* Perform SGD for one epoch. */
        l2sgd(crf1mt, perm, M, 1.0 / (lambda * eta), lambda, 1, 1, 1, 0., &logp);

        /* Make sure that this learning rate improves (increases) the log-likelihood. */
        ok = isfinite(logp) && (init_logp < logp);
        if (ok) {
            logging(crf1mt->lg, "%f\n", logp);
        } else {
            logging(crf1mt->lg, "%f (worse)\n", logp);
        }

        if (ok) {
            --num_candidates;
            if (best_logp < logp) {
                best_logp = logp;
                best_eta = eta;
            }
        }

        if (!dec) {
            if (ok) {
                eta *= rate;
            } else {
                dec = 1;
                eta = init_eta / rate;
            }
        } else {
            eta /= rate;
        }

        ++trials;
    }

    eta = best_eta;
    logging(crf1mt->lg, "Best learning rate (eta): %f\n", eta);
    logging(crf1mt->lg, "Seconds required: %.3f\n",
        (clock() - clk_begin) / (double)CLOCKS_PER_SEC);
    logging(crf1mt->lg, "\n");

    free(perm);

    return 1.0 / (lambda * eta);
}

int crf1ml_sgd_options(crf_params_t* params, crf1ml_option_t* opt, int mode)
{
    crf1ml_sgd_option_t* sgd = &opt->sgd;

    BEGIN_PARAM_MAP(params, mode)
        DDX_PARAM_FLOAT("regularization.sigma", sgd->sigma, 1., "")
        DDX_PARAM_INT("sgd.max_iterations", sgd->max_iterations, 1000, "")
        DDX_PARAM_INT("sgd.period", sgd->period, 10, "")
        DDX_PARAM_FLOAT("sgd.delta", sgd->delta, 1e-6, "")
        DDX_PARAM_FLOAT("sgd.calibration.eta", sgd->calibration_eta, 0.1, "")
        DDX_PARAM_FLOAT("sgd.calibration.rate", sgd->calibration_rate, 2., "")
        DDX_PARAM_INT("sgd.calibration.samples", sgd->calibration_samples, 1000, "")
        DDX_PARAM_INT("sgd.calibration.candidates", sgd->calibration_candidates, 10, "")
    END_PARAM_MAP()

    return 0;
}

int crf1ml_sgd(
    crf1ml_t* crf1mt,
    crf1ml_option_t *opt
    )
{
    int ret = 0;
    int *perm = NULL;
    clock_t clk_begin;
    floatval_t logp = 0;
    const int N = crf1mt->num_sequences;
    const int K = crf1mt->num_features;
    crf1ml_sgd_option_t* sgdopt = &opt->sgd;
    sgd_internal_t sgd_internal;

    /* Set the solver-specific information. */
    crf1mt->solver_data = &sgd_internal;

    sgdopt->lambda = 1.0 / (sgdopt->sigma * sgdopt->sigma * N);

    logging(crf1mt->lg, "Stochastic Gradient Descent (SGD)\n");
    logging(crf1mt->lg, "regularization.sigma: %f\n", sgdopt->sigma);
    logging(crf1mt->lg, "sgd.max_iterations: %d\n", sgdopt->max_iterations);
    logging(crf1mt->lg, "sgd.period: %d\n", sgdopt->period);
    logging(crf1mt->lg, "sgd.delta: %f\n", sgdopt->delta);
    logging(crf1mt->lg, "\n");
    clk_begin = clock();

    /* Calibrate the learning rate (eta). */
    sgdopt->t0 = l2sgd_calibration(crf1mt, sgdopt);

    /* Initialize a permutation that shuffles the instances. */
    perm = (int*)malloc(sizeof(int) * N);
    crf1ml_shuffle(perm, N, 1);

    /* Initialize the feature weights to zero. */
    initialize_weights(crf1mt);

    /* Perform stochastic gradient descent. */
    ret = l2sgd(
        crf1mt, perm, N,
        sgdopt->t0, sgdopt->lambda,
        sgdopt->max_iterations, 0, sgdopt->period, sgdopt->delta,
        &logp
        );

    logging(crf1mt->lg, "Log-likelihood: %f\n", logp);
    logging(crf1mt->lg, "Total seconds required for SGD: %.3f\n",
        (clock() - clk_begin) / (double)CLOCKS_PER_SEC);
    logging(crf1mt->lg, "\n");

    free(perm);

    return ret;
}
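
/*
 * Note on the boundary/proj computation in l2sgd above: it appears to follow the
 * Pegasos-style trick of projecting the weight vector onto the L2-ball of radius
 * 1/sqrt(lambda) whenever it drifts outside, using the squared norm tracked in
 * sgdi->norm2 and the lazy scaling factor `scale`. The standalone sketch below
 * shows just that step; the function name l2_projection_factor and its parameters
 * are hypothetical and introduced only for illustration, not part of this file.
 */
#include <math.h>

static double l2_projection_factor(double norm2, double scale, double lambda)
{
    /* True squared norm of the weights is norm2 * scale * scale. */
    double boundary = norm2 * scale * scale * lambda;

    /* Shrink the weights only when they lie outside the ball of radius 1/sqrt(lambda). */
    return (1. < boundary) ? (1.0 / sqrt(boundary)) : 1.0;
}
/*
 * The returned factor would be folded into the running scale (as `proj` is here),
 * so the weights never need to be rescaled element by element on every update.
 */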