📄 lda-estimate.c
// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)

// This file is part of LDA-C.

// LDA-C is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or (at your
// option) any later version.

// LDA-C is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
// USA

#include "lda-estimate.h"

/*
 * perform inference on a document and update sufficient statistics
 *
 */

double doc_e_step(document* doc, double* gamma, double** phi,
                  lda_model* model, lda_suffstats* ss)
{
    double likelihood;
    int n, k;

    // posterior inference

    likelihood = lda_inference(doc, model, gamma, phi);

    // update sufficient statistics

    double gamma_sum = 0;
    for (k = 0; k < model->num_topics; k++)
    {
        gamma_sum += gamma[k];
        ss->alpha_suffstats += digamma(gamma[k]);
    }
    ss->alpha_suffstats -= model->num_topics * digamma(gamma_sum);

    for (n = 0; n < doc->length; n++)
    {
        for (k = 0; k < model->num_topics; k++)
        {
            ss->class_word[k][doc->words[n]] += doc->counts[n]*phi[n][k];
            ss->class_total[k] += doc->counts[n]*phi[n][k];
        }
    }

    ss->num_docs = ss->num_docs + 1;

    return(likelihood);
}


/*
 * writes the word assignments line for a document to a file
 *
 */

void write_word_assignment(FILE* f, document* doc, double** phi, lda_model* model)
{
    int n;

    fprintf(f, "%03d", doc->length);
    for (n = 0; n < doc->length; n++)
    {
        fprintf(f, " %04d:%02d",
                doc->words[n], argmax(phi[n], model->num_topics));
    }
    fprintf(f, "\n");
    fflush(f);
}


/*
 * saves the gamma parameters of the current dataset
 *
 */

void save_gamma(char* filename, double** gamma, int num_docs, int num_topics)
{
    FILE* fileptr;
    int d, k;

    fileptr = fopen(filename, "w");

    for (d = 0; d < num_docs; d++)
    {
        fprintf(fileptr, "%5.10f", gamma[d][0]);
        for (k = 1; k < num_topics; k++)
        {
            fprintf(fileptr, " %5.10f", gamma[d][k]);
        }
        fprintf(fileptr, "\n");
    }
    fclose(fileptr);
}


/*
 * run_em
 *
 */

void run_em(char* start, char* directory, corpus* corpus)
{
    int d, n;
    lda_model *model = NULL;
    double **var_gamma, **phi;

    // allocate variational parameters

    var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
    for (d = 0; d < corpus->num_docs; d++)
        var_gamma[d] = malloc(sizeof(double) * NTOPICS);

    int max_length = max_corpus_length(corpus);
    phi = malloc(sizeof(double*)*max_length);
    for (n = 0; n < max_length; n++)
        phi[n] = malloc(sizeof(double) * NTOPICS);

    // initialize model

    char filename[100];

    lda_suffstats* ss = NULL;
    if (strcmp(start, "seeded")==0)
    {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        corpus_initialize_ss(ss, model, corpus);
        lda_mle(model, ss, 0);
        model->alpha = INITIAL_ALPHA;
    }
    else if (strcmp(start, "random")==0)
    {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        random_initialize_ss(ss, model);
        lda_mle(model, ss, 0);
        model->alpha = INITIAL_ALPHA;
    }
    else
    {
        model = load_lda_model(start);
        ss = new_lda_suffstats(model);
    }

    sprintf(filename,"%s/000",directory);
    save_lda_model(model, filename);

    // run expectation maximization

    int i = 0;
    double likelihood, likelihood_old = 0, converged = 1;
    sprintf(filename, "%s/likelihood.dat", directory);
    FILE* likelihood_file = fopen(filename, "w");

    while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER))
    {
        i++;
        printf("**** em iteration %d ****\n", i);
        likelihood = 0;
        zero_initialize_ss(ss, model);

        // e-step

        for (d = 0; d < corpus->num_docs; d++)
        {
            if ((d % 1000) == 0) printf("document %d\n",d);
            likelihood += doc_e_step(&(corpus->docs[d]),
                                     var_gamma[d],
                                     phi,
                                     model,
                                     ss);
        }

        // m-step

        lda_mle(model, ss, ESTIMATE_ALPHA);

        // check for convergence

        converged = (likelihood_old - likelihood) / (likelihood_old);
        if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
        likelihood_old = likelihood;

        // output model and likelihood

        fprintf(likelihood_file, "%10.10f\t%5.5e\n", likelihood, converged);
        fflush(likelihood_file);
        if ((i % LAG) == 0)
        {
            sprintf(filename,"%s/%03d",directory, i);
            save_lda_model(model, filename);
            sprintf(filename,"%s/%03d.gamma",directory, i);
            save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
        }
    }

    // output the final model

    sprintf(filename,"%s/final",directory);
    save_lda_model(model, filename);
    sprintf(filename,"%s/final.gamma",directory);
    save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);

    // output the word assignments (for visualization)

    sprintf(filename, "%s/word-assignments.dat", directory);
    FILE* w_asgn_file = fopen(filename, "w");
    for (d = 0; d < corpus->num_docs; d++)
    {
        if ((d % 100) == 0) printf("final e step document %d\n",d);
        likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi);
        write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
    }
    fclose(w_asgn_file);
    fclose(likelihood_file);
}


/*
 * read settings.
 *
 */

void read_settings(char* filename)
{
    FILE* fileptr;
    char alpha_action[100];
    fileptr = fopen(filename, "r");
    fscanf(fileptr, "var max iter %d\n", &VAR_MAX_ITER);
    fscanf(fileptr, "var convergence %f\n", &VAR_CONVERGED);
    fscanf(fileptr, "em max iter %d\n", &EM_MAX_ITER);
    fscanf(fileptr, "em convergence %f\n", &EM_CONVERGED);
    fscanf(fileptr, "alpha %s", alpha_action);
    if (strcmp(alpha_action, "fixed")==0)
    {
        ESTIMATE_ALPHA = 0;
    }
    else
    {
        ESTIMATE_ALPHA = 1;
    }
    fclose(fileptr);
}


/*
 * inference only
 *
 */

void infer(char* model_root, char* save, corpus* corpus)
{
    FILE* fileptr;
    char filename[100];
    int i, d, n;
    lda_model *model;
    double **var_gamma, likelihood, **phi;
    document* doc;

    model = load_lda_model(model_root);
    var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
    for (i = 0; i < corpus->num_docs; i++)
        var_gamma[i] = malloc(sizeof(double)*model->num_topics);
    sprintf(filename, "%s-lda-lhood.dat", save);
    fileptr = fopen(filename, "w");
    for (d = 0; d < corpus->num_docs; d++)
    {
        if (((d % 100) == 0) && (d>0)) printf("document %d\n",d);

        doc = &(corpus->docs[d]);
        phi = (double**) malloc(sizeof(double*) * doc->length);
        for (n = 0; n < doc->length; n++)
            phi[n] = (double*) malloc(sizeof(double) * model->num_topics);
        likelihood = lda_inference(doc, model, var_gamma[d], phi);

        fprintf(fileptr, "%5.5f\n", likelihood);
    }
    fclose(fileptr);
    sprintf(filename, "%s-gamma.dat", save);
    save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
}


/*
 * update sufficient statistics
 *
 */



/*
 * main
 *
 */

int main(int argc, char* argv[])
{
    // (est / inf) alpha k settings data (random / seed/ model) (directory / out)

    corpus* corpus;

    long t1;
    (void) time(&t1);
    seedMT(t1);
    // seedMT(4357U);

    if (argc > 1)
    {
        if (strcmp(argv[1], "est")==0)
        {
            INITIAL_ALPHA = atof(argv[2]);
            NTOPICS = atoi(argv[3]);
            read_settings(argv[4]);
            corpus = read_data(argv[5]);
            make_directory(argv[7]);
            run_em(argv[6], argv[7], corpus);
        }
        if (strcmp(argv[1], "inf")==0)
        {
            read_settings(argv[2]);
            corpus = read_data(argv[4]);
            infer(argv[3], argv[5], corpus);
        }
    }
    else
    {
        printf("usage : lda est [initial alpha] [k] [settings] [data] [random/seeded/*] [directory]\n");
        printf("        lda inf [settings] [model] [data] [name]\n");
    }
    return(0);
}
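
Usage sketch (not part of the source file): the command lines below are inferred from the usage message printed by main(), and the settings-file layout is inferred from the fscanf format strings in read_settings(). File names and numeric values are illustrative assumptions, not values mandated by the code.

    lda est 0.1 20 settings.txt corpus.dat random model-dir
    lda inf settings.txt model-dir/final corpus.dat out

where settings.txt would contain one setting per line, matching read_settings() exactly:

    var max iter 20
    var convergence 1e-6
    em max iter 100
    em convergence 1e-4
    alpha estimate

The last line controls ESTIMATE_ALPHA: the literal word "fixed" disables alpha estimation, and any other token enables it. The [model] argument to "inf" is the prefix of a model saved by run_em (e.g. the "final" files written into the estimation directory); the exact files read under that prefix are defined by load_lda_model() in another source file, not shown here.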