📄 myhmm.c

📁 一个马尔可夫模型的源码
💻 C
📖 第 1 页 / 共 3 页
字号:
    /* basic condition in forward algorithm */
    t=0;
    iObservation = getObservation( line[t] );
    for(i=0; i < N; i++)
    {
        alpha[t][i] = pi[i]*emissions[i][ iObservation ];
    }

    /* the induction step in forward algorithm */
    for(t=1; t < length; t++)
    {
        iObservation = getObservation( line[t] );
        for(i=0; i < N; i++)
        {
            sumTransProb = 0;
            for(j=0; j < N; j++)
            {
                sumTransProb = sumTransProb + alpha[t-1][j] * transitions[j][i];
            }

            alpha[t][i] = sumTransProb * emissions[i][iObservation];
        }
    }

    /* the termination step */
    t = length-1;
    prob = 0;
    for(i=0; i < N; i++)
    {
        prob = prob + alpha[t][i];
    }

    return prob;
}


/************************************************************************
NAME
     backward - backward algorithm

DESCRIPTION
     This function calculates the probability of the observation sequence
	 with backward algorithm given the HMM model.

     Input:
	   HMM model & the observation sequence
	 Output:
	   Probability
	 Global variables list:
	   N, beta, pi, transitions, emissions.
*************************************************************************/
double backward(char *line)
{
    int i, j, t;
    int length = strlen( line );
    /* the order of the real observation in the observation set */
    int iObservation;
    /* probability to state i given the sequence and the observation */
    double sumProbi;
    double prob;

    /* basic condition in backward algorithm */
    t = length-1;
    for(i=0; i < N; i++)
    {
        beta[t][i] = 1;
    }

    /* the induction step in backward algorithm */
    for(t=length-2; t >= 0; t--)
    {
        iObservation = getObservation( line[t+1] );
        for(i=0; i < N; i++)
        {
            sumProbi = 0;
            for(j=0; j < N; j++)
            {
                sumProbi = sumProbi + transitions[i][j]*emissions[j][iObservation] * beta[t+1][j];
            }

            beta[t][i] = sumProbi;
        }
    }

    /* the termination step */
    t = 0;
    iObservation = getObservation( line[t] );
    prob = 0;
    for(i=0; i < N; i++)
    {
        prob = prob + pi[i]*emissions[i][iObservation]*beta[t][i];
    }

    return prob;
}

/************************************************************************
NAME
     viterbi - viterbi algorithm

DESCRIPTION
     This function calculates the most probable state path of the
	 observation sequence with viterbi algorithm given the HMM model.

     Input:
	   HMM model & the observation sequence
	 Output:
	   the most probable state path
	 Global variables list:
	   N, beta, pi, transitions, emissions.
*************************************************************************/
void viterbi(char *line, char *buffer)
{
    /* delta_t0[ N ] */
    double* delta_t0;
    /* delta_t1[ N ] */
    double* delta_t1;
    /* the matrix to store the optimal states, used for backtracking
       size: N * length */
    int *phi;
    /* position in the matrix phi */
    int pos;
    int length = strlen( line );
    int i, j, t;
    int fromState, inState;
    int iObservation;
    double sum_for_scale;
    double maxProb, currentProb;

    /* allocate space for buffer */
    delta_t0 = (double*)malloc(N*sizeof(double));
    delta_t1 = (double*)malloc(N*sizeof(double));
    phi = (int*)malloc(length * N * sizeof( int ));

    /* the initial step */
    iObservation = getObservation( line[0] );
    for(i=0; i < N; i++)
    {
        delta_t0[i] = pi[i] * emissions[i][iObservation];
        phi[i] = 0;
    }

    /* recursion step */
    pos = N;
    for(t=1; t < length; t++)
    {
        iObservation = getObservation( line[t] );
        sum_for_scale = 0;
        for(j=0; j < N; j++)
        {
            /* find the optimal state from which transfered to state j */
            maxProb = delta_t0[0] * transitions[0][j];
            fromState = 0;
            for(i=1; i < N; i++)
            {
                currentProb = delta_t0[i] * transitions[i][j];
                if(maxProb < currentProb)
                {
                    maxProb = currentProb;
                    fromState = i;
                }
            }
            /* calculate the transition probability */
            delta_t1[j] = emissions[j][iObservation] * maxProb;
            sum_for_scale = sum_for_scale + delta_t1[j];
            phi[ pos++ ] = fromState;
        }

        /* update current probabilities */
        for(j=0; j < N; j++)
        {
            if(sum_for_scale < 0.1)
                delta_t0[j] = delta_t1[j] * 10;
            else
                delta_t0[j] = delta_t1[j];
        }
    }

    /* termination */
    maxProb = delta_t0[0];
    inState = 0;
    for(i=1; i < N; i++)
    {
        currentProb = delta_t0[i];
        if(maxProb < currentProb)
        {
            maxProb = currentProb;
            inState = i;
        }
    }

    /* path backtracking */
    i = inState;
    pos = length * N;
    for(t=length-1; t >= 0 ; t--)
    {
        buffer[ t ] = getSymbol( i );
        pos = pos - N;
        if(pos+i < 0)
        {
            printf("Error in the subscribe 1: pos = %d, statei = %d\n", pos, i);
            exit(1);
        }
        if(pos+i >= length * N)
        {
            printf("Error in the subscribe 2\n");
            exit(1);
        }
        i = phi[pos + i];
    }

    buffer[length] = '\0';
    free(delta_t0);
    free(delta_t1);
    free( phi );

    return;
}

/************************************************************************
NAME
     baumWelch - baumWelch algorithm

DESCRIPTION
     This function estimates the parameters of HMM with specified topology
	 given the training data and initial parameters of HMM

     Input:
	   HMM topology, HMM initial parameters, and training data
	 Output:
	   the estimated parameters
	 Global variables list:
	   iTrain, trainData, T, M, N, alpha, beta, pi, transitions, emissions.
*************************************************************************/
void baumWelch()
{
    int loops;
    int i, j, k, t;
    int iObservation;
    int length;
    double** _transitions; /* _transitions[ N ][ N ] estimated transition matrix T */
    double** _emissions;   /* _emissions[ N ][ M ]   estimated emission matrix E */
    double* _pi;           /* _pi[ N ]               estimated initial distribution */
    double oldLikelihood, Likelihood;
    double error;
    double sumGamma1;
    double* gamma1;  // gamma1[N]: for estimating initial distribution of the states
    double** Eij;  // Eij[N][N]: expected numbers transited from state i to state j
    double** Eia;  // Eia[N][M]: expected numbers of emitting a in state i
	/* X_d_t[N][N]: expected numbers transited from state i to state j
         when it is with observation O in the position t of d-th sequence */
    double** X_d_t;
    double* ei;    // ei[N]: expected numbers in state i
    double* ej;    // ej[N]: expected numbers of emissions in state j
    double Prob;
    double temp, temp2, tempM;

    /* allocate space */
    AllocateDataSpace( &_transitions, N, N );
    AllocateDataSpace( &_emissions, N, M );
    _pi = (double*)malloc(N*sizeof(double));
    gamma1 = (double*)malloc(N*sizeof(double));
    AllocateDataSpace( &Eij, N, N );
    AllocateDataSpace( &Eia, N, M );
    AllocateDataSpace( &X_d_t, N, N );
    ei = (double*)malloc(N*sizeof(double));
    ej = (double*)malloc(N*sizeof(double));

    /* initialization */
    error = 1;
    oldLikelihood = forward( trainData[0] );
    Likelihood = backward( trainData[0] );
    for(k=1; k < iTrain; k++)
    {
        oldLikelihood = oldLikelihood + forward(trainData[k]);
        Likelihood = Likelihood + backward(trainData[k]);
    }
    /* check whether forward and backward algorithms work well */
    if(fabs((Likelihood-oldLikelihood)/Likelihood) > 0.01)
    {
        printf("there are errors in forward or backward algorithm\n");
        exit(1);
    }
    Likelihood = oldLikelihood;
    printf("Likelihood from forward: %e\n", Likelihood);
    loops = 0;
    while((error > threshold && loops < 1000) && Likelihood > threshold)
    {
        printf("loops: %d\n", loops);
        loops++;
        /* initialize the expected matrix */
        for(i=0; i < N; i++)
        {
            gamma1[i] = 0;
            for(j=0; j < N; j++)
            {
                Eij[i][j] = 0;
            }
            for(j=0; j < M; j++)
            {
                Eia[i][j] = 0;
            }
        }

        // loop for all the sequences
        for(k=0; k < iTrain; k++)
        {
            // initialize the matrix alpha and beta
            Prob = backward(trainData[k]);
            Prob = forward(trainData[k]);
            length = strlen(trainData[k]);
            // loop for one sequence
            for(t=0; t < length-1; t++)
            {
                iObservation = getObservation(trainData[k][t+1]);
                for(i=0; i < N; i++)
                {
                    for(j=0; j < N; j++)
                    {
                        X_d_t[i][j] = (alpha[t][i] * transitions[i][j] * emissions[j][iObservation] * beta[t+1][j]) / Prob;
                        Eij[i][j] = Eij[i][j] + X_d_t[i][j];
                        Eia[j][iObservation] = Eia[j][iObservation] + X_d_t[i][j];
                        if(t == 0)
                        {
                            gamma1[i] = gamma1[i] + X_d_t[i][j];
                        }
                    }
                }
            }
        }
        sumGamma1 = 0;
        for(i=0; i < N; i++)
        {
            sumGamma1 = sumGamma1 + gamma1[i];
            ei[i] = Eij[i][0];
            for(j=1; j < N; j++)
            {
                ei[i] = ei[i] + Eij[i][j];
            }
            ej[i] = Eia[i][0];
            for(j=1; j < M; j++)
            {
                ej[i] = ej[i] + Eia[i][j];
            }
        }

        temp = 1.0 / (double)N;
        temp2= temp/ (double)N;
        tempM= 1.0 / (double)M;
        for(i=0; i < N; i++)
        {
            // estimate initial distribution
            if(sumGamma1 < threshold2)
                _pi[i] = temp; /* uniform distribution */
            else
                _pi[i] = gamma1[i] / sumGamma1;
            // estimate the transition matrix
            for(j=0; j < N; j++)
            {
                if(ei[i] < threshold2)
                {
                    if(i == j)
                        _transitions[i][j] = 1 - temp + temp2;
                    else
                    _transitions[i][j] = temp2;
                }
                else
                {
                    _transitions[i][j] = Eij[i][j] / ei[i];
                }
            }
            // estimate the emission matrix
            for(j=0; j < M; j++)
            {
                if(ej[i] < threshold2)
                    _emissions[i][j] = tempM;
                else
                    _emissions[i][j] = Eia[i][j] / ej[i];
            }
        }

        /* assign the estimated new values to the HMM model */
        for(i=0; i < N; i++)
        {
            /* do not need to update the initial state distribution */
            /*pi[i] = _pi[i];*/

            /* transition matrix */
            for(j=0; j < N; j++)
            {
                transitions[i][j] = _transitions[i][j];
            }

            /* emission matrix */
            for(j=0; j < M; j++)
            {
                emissions[i][j] = _emissions[i][j];
            }
        }
        /* compute the error */
        Likelihood = forward( trainData[0] );
        for(k=1; k < iTrain; k++)
        {
            Likelihood = Likelihood + forward(trainData[k]);
        }
        error = fabs(Likelihood - oldLikelihood);
        printf("oldLikelihood = %e, Likelihood = %e, error = %e\n", oldLikelihood, Likelihood, error);
        oldLikelihood = Likelihood;
    }
}

/************************************************************************
NAME
     loadHMM - load hidden Markov model from the specified file

DESCRIPTION
     This function ...

     Input:
	   the specified HMM file name
	 Output:
	   N, M, transitions, emissions, pi
	 Global variables list:
	   M, N, pi, transitions, emissions.
*************************************************************************/
void loadHMM(char* hmmFile)
{
    FILE *in_fp;
    char* token;
    int i, j, m;
    int end;
    int nLines = cal_lines(hmmFile);
    int nBuffer = getLengthOfLongestLine(hmmFile)+extraSpace;
    char *line, *buffer;
    /* Open for read (will fail if inputfile does not exist) */
    if( (in_fp  = fopen( hmmFile, "r" )) == NULL )
    {
        printf( "The file '%s' was not opened\n", hmmFile);
        exit(1);
    }
    line = (char *) malloc(nBuffer * sizeof(char));
    buffer = (char *) malloc(nBuffer * sizeof(char));

    // Part 1
    // read the first line
    fgets(line, nBuffer, in_fp);

    // number of states
    token = strtok( line, seps );
    N = atoi(token);
    
    /* exceptions for number of states */
    if(N < 1)
    {
        printf("the number of states is too small!\n");
        exit(1);
    }
    if(N > 100)
    {
        printf("the program can't deal with states more than 100!\n");
        exit(1);
    }
    /* number of observations */
    token = strtok( NULL, seps );
    M = atoi(token);
    observationsDefined = TRUE;
    if(M == noDefObservations)
    {
        observationsDefined = FALSE;
    }
    /* exceptions for number of observations */
    if((M <= 0 && M != noDefObservations) || M > 100)
    {
        printf("the number of the observations is not correct!\n");
        exit(1);
    }
    /* the possible length of the longest line */
    token = strtok( NULL, seps );
    T = atoi(token);

    // skip one line
    fgets(line, nBuffer, in_fp);

    // Part 2
    fgets(line, nBuffer, in_fp);
    /* whether to read the observations */
    if(observationsDefined == TRUE)
    {
        strcpy(buffer, line);
        m = 0;
        token = strtok( buffer, seps );
        // count how many observations there are in the line
        while(token != NULL)
        {
            token = strtok( NULL, seps );
            m++;
        }
        m = m - 1; // because there is a prefix in the sequence

        /* check whether the nunber of the observations is consistent */
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -