⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dpdriver.pas

📁 解码器是基于短语的统计机器翻译系统的核心模块
💻 PAS
字号:
(*
* DPDRIVER.PAS  -  A phrase-based decoder implemented using DP** Copyright (C) 2006 by Yidong Chen <ydchen@xmu.edu.cn>Institute of Artificial Intelligence, Xiamen University* Begin       : 09/18/2006* Last Change : 09/18/2006** This program is free software; you can redistribute it and/or* modify it under the terms of the GNU Lesser General Public* License as published by the Free Software Foundation; either* version 2.1 of the License, or (at your option) any later version.** This program is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the* GNU General Public License for more details.** You should have received a copy of the GNU Lesser General Public* License along with this program; if not, write to the Free Software* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.*)
UNIT DPDRIVER;

INTERFACE

USES SysUtils, Windows, Classes, COMMON, CONFFILE;

FUNCTION Init(init_cfConfig: TConfFile;
              init_rtType: TResultType;
              init_stType: TSegType;
              init_ptType: TPhraseType): Real;
FUNCTION TranslateIt(strInput: STRING): STRING;
FUNCTION Translate863File(strSrcFile, strTgtFile: STRING): Real;
PROCEDURE CleanUp;

IMPLEMENTATION

USES Math, PERIOD, HASH, FSTRLIST, LM, BP, PREPROC, NEON;

TYPE
  TPAQuantity=^TAQuantity;
  TAQuantity=RECORD
    strE: STRING;
    rProb: Real;
    BackPointer: TPAQuantity;
  END;

VAR
  cfConfig: TConfFile;

  strlSents: TStringList;

  rtType: TResultType;
  ptType: TPhraseType;

FUNCTION Init(init_cfConfig: TConfFile;
              init_rtType: TResultType;
              init_stType: TSegType;
              init_ptType: TPhraseType): Real;
VAR pTimer: TPeriod;
BEGIN
  pTimer:=TPeriod.Create;
  pTimer.Start;

  cfConfig:=init_cfConfig;

  rtType:=init_rtType;
  ptType:=init_ptType;

  CASE rtType OF
    rtePurePb, rtePbNeonWord:
      BEGIN
        IF (rtType<>rtePurePb) THEN
          BEGIN
            Write('Loading Neon...');
            NEON.Init(cfConfig.CEMTPath, False);
            WriteLn(#13, 'Neon has been Loaded OK.');
          END;

        IF init_stType<>steNone THEN
          BEGIN
            Write('Loading PreProcessor...');
            PREPROC.Init(cfConfig, init_stType);
            WriteLn(#13, 'PreProcessor has been Loaded OK.');
          END;

        Write('Loading ', cfConfig.LanguageModel, '...');
        LM.Init(cfConfig.LMPath, cfConfig.LanguageModel);
        WriteLn(#13, cfConfig.LanguageModel, ' has been Loaded OK.');

        Write('Loading ', cfConfig.BilingualPhrases, '...');
        BP.Init(cfConfig.BilingualPhrases, ptType, rtType<>rtePurePb);
        WriteLn(#13, cfConfig.BilingualPhrases, ' has been Loaded OK.');

        WriteLn('Tp = ', FloatToStr(cfConfig.rTp));
        WriteLn('Np = ', cfConfig.iNp);
        WriteLn('*** parameters begin ***');
        WriteLn('p(c|e) = ', FloatToStr(cfConfig.rLPce));
        WriteLn('lex(c|e) = ', FloatToStr(cfConfig.rLLce));
        WriteLn('p(e|c) = ', FloatToStr(cfConfig.rLPec));
        WriteLn('lex(e|c) = ', FloatToStr(cfConfig.rLLec));
        WriteLn('word_penalty = ', FloatToStr(cfConfig.rLLenP));
        WriteLn('language_model = ', FloatToStr(cfConfig.rLLm));
        WriteLn('**** parameters end ****');

        strlSents:=TStringList.Create;
      END;
  END;

  pTimer.Stop;
  Result:=pTimer.TimeElapsed;
  pTimer.Free;
END;

FUNCTION GetSource(strlSentence: TStringList; iLow, iHigh: Integer): STRING;
VAR iLooper: Integer; strTemp: STRING;
BEGIN
  Result:='';
  Dec(iLow);
  Dec(iHigh);
  IF ptType=pteWithBlank THEN strTemp:=' ' ELSE strTemp:='';
  FOR iLooper:=iLow TO iHigh DO Result:=Result+strlSentence[iLooper]+strTemp;
  Result:=Trim(Result);
END;

FUNCTION FirstWord(strEP: STRING): STRING;
BEGIN
  Result:=ReadTrunc(strEP);
END;

FUNCTION SecondWord(strEP: STRING): STRING;
BEGIN
  ReadTrunc(strEP);
  Result:=ReadTrunc(strEP);
END;

FUNCTION LastWord(strEP: STRING): STRING;
BEGIN
  Result:=ReadTruncBack(strEP);
END;

FUNCTION LastButOneWord(strEP: STRING): STRING;
BEGIN
  ReadTruncBack(strEP);
  Result:=ReadTruncBack(strEP);
END;

FUNCTION TheComparer(Item1, Item2: Pointer): Integer;
VAR paqTemp1, paqTemp2: TPAQuantity;
BEGIN
  paqTemp1:=Item1; paqTemp2:=Item2;
  IF paqTemp2.rProb - paqTemp1.rProb>0 THEN Result:=1
  ELSE IF paqTemp2.rProb - paqTemp1.rProb=0 THEN Result:=0
  ELSE Result:=-1;
END;

PROCEDURE DoPruning(lstInput: TList; rQ: Real; iMaxCount: Integer);
VAR iLooper: Integer; paqTemp: TPAQuantity; rNowMax: Real; iUpper: Integer;
BEGIN
  IF (lstInput.Count=0) OR (iMaxCount<=0) OR (rQ<=0) THEN Exit;

  lstInput.Sort(@TheComparer);
  IF iMaxCount>0 THEN
    FOR iLooper:=lstInput.Count-1 DOWNTO iMaxCount DO
      BEGIN
        paqTemp:=lstInput[iLooper];
        Dispose(paqTemp);
        lstInput.Delete(iLooper);
      END;
  IF rQ>0 THEN
    BEGIN
      paqTemp:=lstInput[0];
      rNowMax:=paqTemp.rProb+Ln(rQ);
      IF iMaxCount>0 THEN iUpper:=Min(iMaxCount-1, lstInput.Count-1)
      ELSE iUpper:=lstInput.Count-1;
      FOR iLooper:=iUpper DOWNTO 1 DO
        BEGIN
          paqTemp:=lstInput[iLooper];
          IF rNowMax>paqTemp.rProb THEN
            BEGIN
              Dispose(paqTemp);
              lstInput.Delete(iLooper);
            END;
        END;
    END;
END;

FUNCTION TranslateSent(strInput: STRING): STRING;
VAR strlSentence: TStringList;
    Q: ARRAY OF Pointer;
    J: Integer;
    jLooper, iLooper, kLooper, mLooper: Integer;
    lstTemp: TList;
    rMax, rTemp: Real;
    paqMax, paqTemp, paqLooper: TPAQuantity;
    strF: STRING;
    lstEs: TList;
    ptTemp: TPTranslation;
    iTemp: Integer;
    strTemp: STRING;

    strFirstWord, strLastWord, strLastButOneWord, strSecondWord, strContext: STRING;
BEGIN
  Result:='';

  IF strInput='' THEN Exit;
  strlSentence:=TStringList.Create;
  WHILE strInput<>'' DO strlSentence.Add(ReadTrunc(strInput));
  J:=strlSentence.Count;

  SetLength(Q, J+2);
  FOR iLooper:=0 TO J+1 DO Q[iLooper]:=TList.Create;

  lstTemp:=Q[0];
  New(paqTemp);
  paqTemp.strE:=SOS;
  paqTemp.rProb:=0;
  paqTemp.BackPointer:=NIL;
  lstTemp.Add(paqTemp);

  lstEs:=TList.Create;

  FOR jLooper:=1 TO J DO
    BEGIN

      FOR iLooper:=0 TO jLooper-1 DO
        BEGIN

          lstTemp:=Q[iLooper];
          strF:=GetSource(strlSentence, iLooper+1, jLooper);
          BP.LookUp(strF, lstEs, iLooper+1=jLooper);
          IF lstEs.Count=0 THEN Continue;
          IF lstTemp.Count=0 THEN Continue;

          FOR kLooper:=0 TO lstEs.Count-1 DO
            BEGIN

              ptTemp:=lstEs[kLooper];

              paqMax:=lstTemp[0];

              strFirstWord:=FirstWord(ptTemp.strWord);
              strLastWord:=LastWord(paqMax.strE);
              strLastButOneWord:=LastButOneWord(paqMax.strE);
              strSecondWord:=SecondWord(ptTemp.strWord);

              IF strLastButOneWord='' THEN strContext:=strLastWord
              ELSE strContext:=strLastButOneWord+' '+strLastWord;

              rMax:=paqMax.rProb+
                    cfConfig.rLPce*ptTemp.rProbCE+
                    cfConfig.rLLce*ptTemp.rLexCE+
                    cfConfig.rLPec*ptTemp.rProbEC+
                    cfConfig.rLLec*ptTemp.rLexEC+
                    cfConfig.rLLm*LM.P(strFirstWord, strContext)+
                    cfConfig.rLLenP*ptTemp.rELen;

              IF strSecondWord<>'' THEN
                rMax:=rMax+cfConfig.rLLm*LM.P(strSecondWord, strLastWord+' '+strFirstWord);

              IF ptTemp.rLMScore<0 THEN ptTemp.rLMScore:=LM.PPhrase(ptTemp.strWord);
              rMax:=rMax+cfConfig.rLLm*ptTemp.rLMScore;

              FOR mLooper:=1 TO lstTemp.Count-1 DO
                BEGIN

                  paqTemp:=lstTemp[mLooper];

                  strLastWord:=LastWord(paqTemp.strE);
                  strLastButOneWord:=LastButOneWord(paqTemp.strE);

                  IF strLastButOneWord='' THEN strContext:=strLastWord
                  ELSE strContext:=strLastButOneWord+' '+strLastWord;

                  rTemp:=paqTemp.rProb+
                         cfConfig.rLPce*ptTemp.rProbCE+
                         cfConfig.rLLce*ptTemp.rLexCE+
                         cfConfig.rLPec*ptTemp.rProbEC+
                         cfConfig.rLLec*ptTemp.rLexEC+
                         cfConfig.rLLm*LM.P(strFirstWord, strContext)+
                         cfConfig.rLLenP*ptTemp.rELen;

                  IF strSecondWord<>'' THEN
                    rTemp:=rTemp+cfConfig.rLLm*LM.P(strSecondWord, strLastWord+' '+strFirstWord);

                  IF ptTemp.rLMScore<0 THEN ptTemp.rLMScore:=LM.PPhrase(ptTemp.strWord);
                  rTemp:=rTemp+cfConfig.rLLm*ptTemp.rLMScore;

                  IF rTemp>rMax THEN
                    BEGIN
                      rMax:=rTemp;
                      paqMax:=paqTemp;
                    END;
                END;

              New(paqTemp);
              paqTemp.strE:=ptTemp.strWord;
              paqTemp.rProb:=rMax;
              paqTemp.BackPointer:=paqMax;
              TList(Q[jLooper]).Add(paqTemp);
            END;

          lstEs.Clear;
        END;

      DoPruning(Q[jLooper], cfConfig.rTp, cfConfig.iNp);
    END;

  lstEs.Free;

  lstTemp:=Q[J];
  IF lstTemp.Count=0 THEN Exit; (*...*)
  paqMax:=lstTemp[0];
  rMax:=paqMax.rProb+
        cfConfig.rLLm*LM.P(EOS, LastWord(paqMax.strE));
  FOR mLooper:=1 TO lstTemp.Count-1 DO
    BEGIN
      paqTemp:=lstTemp[mLooper];
      rTemp:=paqTemp.rProb+
             cfConfig.rLLm*LM.P(EOS, LastWord(paqTemp.strE));
      IF rTemp>rMax THEN
        BEGIN
          rMax:=rTemp;
          paqMax:=paqTemp;
        END;
    END;
  New(paqTemp);
  paqTemp.strE:=EOS;
  paqTemp.rProb:=rMax;
  paqTemp.BackPointer:=paqMax;
  TList(Q[J+1]).Add(paqTemp);

  paqLooper:=paqMax;
  Result:='';
  WHILE paqLooper<>NIL DO
    BEGIN
      IF (paqLooper.BackPointer<>NIL) AND (paqLooper.BackPointer.BackPointer=NIL) THEN
        BEGIN
          strTemp:=paqLooper.strE;
          IF (strTemp<>'') AND (strTemp[1] IN ['a'..'z']) THEN
            BEGIN
              strTemp[1]:=UpCase(strTemp[1]);
              paqLooper.strE:=strTemp;
            END;
        END;
      IF paqLooper.BackPointer<>NIL THEN Result:=paqLooper.strE+' '+Result;
      paqLooper:=paqLooper.BackPointer;
    END;

  Result:=PostProcess(Result);

  FOR iLooper:=0 TO J+1 DO
    BEGIN
      lstTemp:=Q[iLooper];
      FOR jLooper:=lstTemp.Count-1 DOWNTO 0 DO
        BEGIN
          paqTemp:=lstTemp[jLooper];
          Dispose(paqTemp);
          lstTemp.Delete(jLooper);
        END;
      lstTemp.Free;
    END;

  strlSentence.Free;

  BP.ClearTempWords;
END;

PROCEDURE SplitSent(strInput: STRING; strlOutput: TStringList);
VAR strTemp, strSub: STRING;
BEGIN
  IF strlOutput=NIL THEN Exit;
  strlOutput.Clear;
  strSub:='';
  WHILE strInput<>'' DO
    BEGIN
      strTemp:=ReadTrunc(strInput);
      IF strSub='' THEN strSub:=strTemp ELSE strSub:=strSub+' '+strTemp;
      IF (strTemp='.') OR (strTemp='。') OR
         (strTemp='?') OR (strTemp='?') OR
         (strTemp='!') OR (strTemp='!') OR
         (strTemp=';') OR (strTemp=';') OR
         (strTemp='...') OR (strTemp='……') OR (strTemp='…') {OR
         (strTemp=',') OR (strTemp=',') OR
         (strTemp=':') OR (strTemp=':') OR
         (strTemp='-') OR (strTemp='--') OR (strTemp='————') OR (strTemp='——')} THEN
        BEGIN
          strlOutput.Add(strSub);
          strSub:='';
        END;
    END;
  IF strSub<>'' THEN strlOutput.Add(strSub);
END;

FUNCTION TranslateIt(strInput: STRING): STRING;
VAR iLooper: Integer;
BEGIN
  CASE rtType OF
    rtePurePb, rtePbNeonWord:
      BEGIN
        Result:='';

        strInput:=PREPROC.PreProcess(Trim(strInput));

        IF strInput='' THEN Exit;
        SplitSent(strInput, strlSents);

        FOR iLooper:=0 TO strlSents.Count-1 DO
          Result:=Result+TranslateSent(strlSents[iLooper]);

        Result:=Trim(Result);

      END;
  END;
END;

PROCEDURE SplitALine(strInput: STRING; VAR strST: STRING; VAR strContent: STRING; VAR strET: STRING);
BEGIN
  strST:='';
  strET:='';
  strContent:='';
  strST:=ReadTruncEx(strInput, '>'); strST:=strST+'>';
  strContent:=ReadTruncEx(strInput, '<');
  strET:='<'+strInput;
END;

FUNCTION ProcessA863Line(strInput: STRING): STRING;
VAR strTemp, strLine, strST, strET: STRING;
BEGIN
  strLine:=Trim(strInput);
  strTemp:=ReadTrunc(strLine);
  IF strTemp='<srcset' THEN Result:='<tstset '+strLine
  ELSE IF strTemp='<doc' THEN
    BEGIN
      strLine[Length(strLine)]:=' ';
      Result:=strTemp+' '+strLine+'lang="en" site="厦门大学">';
    END
  ELSE IF strTemp='<s' THEN
    BEGIN
      SplitALine(strTemp+' '+strLine, strST, strTemp, strET);
      Result:=strST+TranslateIt(strTemp)+strET;
    END
  ELSE IF strTemp='</srcset>' THEN Result:='</tstset>'
  ELSE Result:=strTemp+' '+strLine;
END;

FUNCTION Translate863File(strSrcFile, strTgtFile: STRING): Real;
VAR pTimer: TPeriod;
    fstrlInput: TFStringList;
    iLooper: Integer;
BEGIN
  pTimer:=TPeriod.Create;
  pTimer.Start;

  fstrlInput:=TFStringList.Create;
  fstrlInput.LoadXML(strSrcFile);

  WriteLn('Totally ', fstrlInput.Count, ' lines...');
  FOR iLooper:=0 TO fstrlInput.Count-1 DO
    BEGIN
    Write(#13, 'Line', iLooper+1);
    fstrlInput[iLooper]:=ProcessA863Line(fstrlInput[iLooper]);
    END;
  WriteLn;

  fstrlInput.Save(strTgtFile);
  fstrlInput.Free;

  pTimer.Stop;
  Result:=pTimer.TimeElapsed;
  pTimer.Free;
END;

PROCEDURE CleanUp;
BEGIN
  strlSents.Free;
  IF rtType<>rteNone THEN BP.CleanUp;
  IF rtType<>rteNone THEN LM.CleanUp;
  PREPROC.CleanUp;
  IF (rtType<>rtePurePb) THEN NEON.CleanUp;
END;

END.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -