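// score.cpp
//
// From the Moses open-source machine translation system · C++ code · 397 lines total
// (code-viewer page header; not part of the original source file)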
// $Id: score.cpp 1470 2007-10-02 21:43:54Z redpony $
// vim:tabstop=2

#include <sstream>
#include <cstdio>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include "AlignmentPhrase.h"
#include "tables-core.h"

using namespace std;

#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) {_IS.getline(_LINE, _SIZE, _DELIM); if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear();}
#define LINE_MAX_LENGTH 10000

// One extracted phrase pair together with its word alignment.
class PhraseAlignment {
public:
  // Ids of the English / foreign phrase as returned by storeIfNew() on
  // phraseTableE / phraseTableF. They are -1 ("no phrase", the same sentinel
  // main() uses for lastForeign) until create() stores the phrases.
  int english, foreign;
  // alignedToE[e] holds the foreign word positions aligned to English word e.
  vector< vector<size_t> > alignedToE;
  // alignedToF[f] holds the English word positions aligned to foreign word f.
  vector< vector<size_t> > alignedToF;

  // Start with well-defined ids so that reading them before create() has
  // assigned anything is not undefined behaviour.
  PhraseAlignment() : english(-1), foreign(-1) {}

  // Parses one line of the extract file; the int is the line number used in
  // warnings.
  void create( char*, int );
  // Empties both alignment tables (the phrase ids are left untouched).
  void clear();
  // True if the other pair has the same phrase ids and identical alignments.
  bool equals( const PhraseAlignment& );
};

// Word-level lexical translation table, filled from a file by load().
class LexicalTable {
public:
  // Probabilities indexed as ltable[ foreign word id ][ english word id ].
  map< WORD_ID, map< WORD_ID, double > > ltable;

  // Reads the lexicon file with the given name into ltable.
  void load( char *fileName );
};

// Splits a line into tokens. Only declared here; the definition is not in
// this file (presumably provided via tables-core — confirm).
vector<string> tokenize( char [] );

// Scores and writes all phrase pairs that share the same foreign phrase.
void processPhrasePairs( vector< PhraseAlignment > & );

// Output stream for the phrase table; opened and closed in main().
ofstream phraseTableFile;

// Word vocabularies for the English (E) and foreign (F) side.
Vocabulary vcbE;
Vocabulary vcbF;
// Lexical translation probabilities, loaded once at start-up in main().
LexicalTable lexTable;
// Phrase id tables for the current group of phrase pairs; main() clears them
// every time the foreign phrase changes.
PhraseTable phraseTableE;
PhraseTable phraseTableF;
// Set in main() when a fifth command-line argument is given; swaps the order
// in which the foreign and English columns are written.
bool inverseFlag;

int main(int argc, char* argv[]) 
{
  cerr << "PhraseScore v1.2.1, written by Philipp Koehn\n"
       << "phrase scoring methods for extracted phrases\n";
  time_t starttime = time(NULL);

  if (argc != 4 && argc != 5) {
    cerr << "syntax: phrase-score extract lex phrase-table [inverse]\n";
    exit(1);
  }
  char* &fileNameExtract = argv[1];
  char* &fileNameLex = argv[2];
  char* &fileNamePhraseTable = argv[3];
  inverseFlag = false;
  if (argc > 4) {
    inverseFlag = true;
    cerr << "using inverse mode\n";
  }
  //  char[] fileNameExtract& = "/data/nlp/koehn/europarl-v2/models/de-en/model/new-extract.sorted";
  //  string fileNameLex = "/data/nlp/koehn/europarl-v2/models/de-en/model/lex.f2n";
  //  string fileNamePhraseTable = "/data/nlp/koehn/europarl-v2/models/de-en/model/new-phrase-table-half.f2n";

  // lexical translation table
  lexTable.load( fileNameLex );
  
  // sorted phrase extraction file
  ifstream extractFile;

  extractFile.open(fileNameExtract);
  if (extractFile.fail()) {
    cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
    exit(1);
  }
  istream &extractFileP = extractFile;

  // output file: phrase translation table
  phraseTableFile.open(fileNamePhraseTable);
  if (phraseTableFile.fail()) {
    cerr << "ERROR: could not open file phrase table file " 
	 << fileNamePhraseTable << endl;
    exit(1);
  }
  
  // loop through all extracted phrase translations
  int lastForeign = -1;
  vector< PhraseAlignment > phrasePairsWithSameF;
  int i=0;
  int fileCount = 0;
  while(true) {
    if (extractFileP.eof()) break;
    if (++i % 100000 == 0) cerr << "." << flush;
    char line[LINE_MAX_LENGTH];    
    SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n');
    //    if (fileCount>0)
    if (extractFileP.eof()) break;
    PhraseAlignment phrasePair;
    phrasePair.create( line, i );
    if (lastForeign >= 0 && lastForeign != phrasePair.foreign) {
      processPhrasePairs( phrasePairsWithSameF );
      for(int j=0;j<phrasePairsWithSameF.size();j++)
	phrasePairsWithSameF[j].clear();
      phrasePairsWithSameF.clear();
      phraseTableE.clear();
      phraseTableF.clear();
      phrasePair.clear(); // process line again, since phrase tables flushed
      phrasePair.create( line, i ); 
    }
    lastForeign = phrasePair.foreign;
    phrasePairsWithSameF.push_back( phrasePair );
  }
  processPhrasePairs( phrasePairsWithSameF );
  phraseTableFile.close();
}

void outputAlignment(const AlignmentPhrase &alignmentPhrase)
{
	for (size_t posWord = 0 ; posWord < alignmentPhrase.GetSize() ; ++posWord)
	{
		stringstream strme("");
		const AlignmentElement &alignmentElement = alignmentPhrase.GetElement(posWord);
		AlignmentElement::const_iterator iterElement;
		for (iterElement = alignmentElement.begin() ; iterElement != alignmentElement.end() ; ++iterElement)
		{
			size_t align = *iterElement;
			strme << "," << align;
		}
		string str = strme.str();
		if (str.size() > 0)
			str = str.substr(1, str.size() - 1);
		phraseTableFile << "(" << str << ") ";
	}

	phraseTableFile << "||| ";
}

void processPhrasePairs( vector< PhraseAlignment > &phrasePair ) {
  map<int, int> countE;
  map<int, int> alignmentE;
  int totalCount = 0;
  int currentCount = 0;
  int maxSameCount = 0;
  int maxSame = -1;
  int old = -1;
  for(int i=0;i<phrasePair.size();i++) {
    if (i>0) {
      if (phrasePair[old].english == phrasePair[i].english) {
				if (! phrasePair[i].equals( phrasePair[old] )) {
					if (currentCount > maxSameCount) {
						maxSameCount = currentCount;
						maxSame = i-1;
					}
					currentCount = 0;
				}
			}
      else {
				// wrap up old E
				if (currentCount > maxSameCount) {
					maxSameCount = currentCount;
					maxSame = i-1;
				}

				alignmentE[ phrasePair[old].english ] = maxSame;
				//	if (maxSameCount != totalCount)
				//  cout << "max count is " << maxSameCount << "/" << totalCount << endl;
				
				// get ready for new E
				totalCount = 0;
				currentCount = 0;
				maxSameCount = 0;
				maxSame = -1;
			}
    }
    countE[ phrasePair[i].english ]++;
    old = i;
    currentCount++;
    totalCount++;
  }
  
  // wrap up old E
  if (currentCount > maxSameCount) {
    maxSameCount = currentCount;
    maxSame = phrasePair.size()-1;
  }
  alignmentE[ phrasePair[old].english ] = maxSame;
  //  if (maxSameCount != totalCount)
  //    cout << "max count is " << maxSameCount << "/" << totalCount << endl;

  // output table
  typedef map< int, int >::iterator II;
  PHRASE phraseF = phraseTableF.getPhrase( phrasePair[0].foreign );
	size_t index = 0;
  for(II i = countE.begin(); i != countE.end(); i++) {
    //    cout << "\tp( " << i->first << " | " << phrasePair[0].foreign << " ; " << phraseF.size() << " ) = ...\n";
		//cerr << index << endl;

    // foreign phrase (unless inverse)
    if (! inverseFlag) {
      for(int j=0;j<phraseF.size();j++)
			{
				//cerr << vcbF.getWord( phraseF[j] ) << " ";
				phraseTableFile << vcbF.getWord( phraseF[j] );
				phraseTableFile << " ";
			}
			//cerr << endl;
      phraseTableFile << "||| ";
		}

    // english phrase
    PHRASE phraseE = phraseTableE.getPhrase( i->first );
		for(int j=0;j<phraseE.size();j++)
		{
			//if ( vcbE.getWord( phraseE[j] ) == "herr")
			//	cerr << "";
			//cerr << vcbE.getWord( phraseE[j] ) << " ";
      phraseTableFile << vcbE.getWord( phraseE[j] );
			phraseTableFile << " ";
		}
		//cerr << endl;
    phraseTableFile << "||| ";

    // foreign phrase (if inverse)
    if (inverseFlag) {
      for(int j=0;j<phraseF.size();j++)
			{
				//cerr << vcbF.getWord( phraseF[j] ) << " ";
				phraseTableFile << vcbF.getWord( phraseF[j] );
				phraseTableFile << " ";
			}
			//cerr << endl;
      phraseTableFile << "||| ";
		}
 
		// merge all alignments 
		AlignmentPhrase alignementF(phraseF.size())
										,alignementE(phraseE.size());

		size_t numExamples = i->second;
		for (size_t currExample = index ; currExample < index + numExamples ; ++currExample)
		{
			vector< vector<size_t> > &currAlignmentF	= phrasePair[currExample].alignedToF
														,&currAlignmentE = phrasePair[currExample].alignedToE;
			alignementF.Merge(currAlignmentF);
			alignementE.Merge(currAlignmentE);
		}

    if (! inverseFlag) 
		{
			outputAlignment(alignementF);
		}

		outputAlignment(alignementE);

    if ( inverseFlag) 
		{
			outputAlignment(alignementF);
		}

		// phrase translation probability
    phraseTableFile << ((double) i->second / (double) phrasePair.size());

    // lexical translation probability
    double lexScore = 1;
    int null = vcbF.getWordID("NULL");
    PhraseAlignment &current = phrasePair[ alignmentE[ i->first ] ];
    for(int ei=0;ei<phraseE.size();ei++) { // all english words have to be explained
      if (current.alignedToE[ ei ].size() == 0)
	lexScore *= lexTable.ltable[ null ][ phraseE[ ei ] ]; // by NULL if neccessary
      else {
	double thisWordScore = 0;
	for(int j=0;j<current.alignedToE[ ei ].size();j++) {
	  thisWordScore += lexTable.ltable[ phraseF[current.alignedToE[ ei ][ j ] ] ][ phraseE[ ei ] ];
	  //	  cout << "lex" << j << "(" << vcbE.getWord( phraseE[ ei ] ) << "|" << vcbF.getWord( phraseF[current.alignedToE[ ei ][ j ] ] ) << ")=" << lexTable.ltable[ phraseF[current.alignedToE[ ei ][ j ] ] ][ phraseE[ ei ] ] << " ";
	}
	lexScore *= thisWordScore / (double)current.alignedToE[ ei ].size();
      }
      //      cout << " => " << lexScore << endl;
    }
    phraseTableFile << " " << lexScore;

    // model 1 score

    // zens&ney lexical score

    phraseTableFile << endl;

		index += i->second;
  }
}

void PhraseAlignment::create( char line[], int lineID ) {
  vector< string > token = tokenize( line );
  int item = 1;
  PHRASE phraseF, phraseE;
  for (int j=0; j<token.size(); j++) {
    if (token[j] == "|||") item++;
    else {
      if (item == 1)
	phraseF.push_back( vcbF.storeIfNew( token[j] ) );
      else if (item == 2)
	phraseE.push_back( vcbE.storeIfNew( token[j] ) );
      else if (item == 3) {
	int e,f;
	sscanf(token[j].c_str(), "%d-%d", &f, &e);
	if (e >= phraseE.size() || f >= phraseF.size()) { 
	  cerr << "WARNING: sentence " << lineID << " has alignment point (" << f << ", " << e << ") out of bounds (" << phraseF.size() << ", " << phraseE.size() << ")\n"; }
	else {
	  if (alignedToE.size() == 0) {
	    vector< size_t > dummy;
	    for(int i=0;i<phraseE.size();i++)
	      alignedToE.push_back( dummy );
	    for(int i=0;i<phraseF.size();i++)
	      alignedToF.push_back( dummy );
	    foreign = phraseTableF.storeIfNew( phraseF );
	    english = phraseTableE.storeIfNew( phraseE );
	  }
	  alignedToE[e].push_back( f );
	  alignedToF[f].push_back( e );
	}
      }
    }
  }
}

void PhraseAlignment::clear() {
  for(int i=0;i<alignedToE.size();i++)
    alignedToE[i].clear();
  for(int i=0;i<alignedToF.size();i++)
    alignedToF[i].clear();
  alignedToE.clear();
  alignedToF.clear();
}

bool PhraseAlignment::equals( const PhraseAlignment& other ) {
  if (this == &other) return true;
  if (other.english != english) return false;
  if (other.foreign != foreign) return false;
  PHRASE phraseE = phraseTableE.getPhrase( english );
  PHRASE phraseF = phraseTableF.getPhrase( foreign );
  for(int i=0;i<phraseE.size();i++) {
    if (alignedToE[i].size() != other.alignedToE[i].size()) return false;
    for(int j=0; j<alignedToE[i].size(); j++) {
      if (alignedToE[i][j] != other.alignedToE[i][j]) return false;
    }
  }
  for(int i=0;i<phraseF.size();i++) {
    if (alignedToF[i].size() != other.alignedToF[i].size()) return false;
    for(int j=0; j<alignedToF[i].size(); j++) {
      if (alignedToF[i][j] != other.alignedToF[i][j]) return false;
    }
  }
  return true;
}

/**
 * Loads the lexical translation table from a text file.
 *
 * Every line must consist of exactly three tokens: English word, foreign word
 * and probability. The probability is stored in ltable[ foreign ][ english ].
 * Lines with a different number of tokens are reported on stderr and skipped.
 *
 * Exits with status 1 when the file cannot be opened or a read fails.
 *
 * @param fileName  path of the lexicon file
 */
void LexicalTable::load( char *fileName ) {
  cerr << "Loading lexical translation table from " << fileName;
  ifstream inFile;
  inFile.open(fileName);
  if (inFile.fail()) {
    cerr << " - ERROR: could not open file\n";
    exit(1);
  }

  char line[LINE_MAX_LENGTH];

  int i=0;
  while(!inFile.eof()) {
    i++;
    if (i%100000 == 0) cerr << "." << flush;
    SAFE_GETLINE(inFile, line, LINE_MAX_LENGTH, '\n');
    if (inFile.bad()) {
      // An unrecoverable read error never sets eofbit; stop instead of
      // looping forever.
      cerr << " - ERROR: failure while reading file\n";
      exit(1);
    }
    // Stop only when the end of file was hit without reading anything. A
    // last line without a trailing newline sets eofbit too, but still has
    // to be processed.
    if (inFile.eof() && line[0] == '\0') break;

    vector<string> token = tokenize( line );
    if (token.size() != 3) {
      // token may be empty (blank line), so token[0] must not be read blindly.
      cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" <<
        token.size() << " " << (token.empty() ? string() : token[0]) << " " << line << endl;
      continue;
    }
    
    double prob = atof( token[2].c_str() );
    WORD_ID wordE = vcbE.storeIfNew( token[0] );
    WORD_ID wordF = vcbF.storeIfNew( token[1] );
    ltable[ wordF ][ wordE ] = prob;
  }
  cerr << endl;
}

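// Keyboard shortcuts (code-viewer page footer; not part of the original source file):
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -
//   Show shortcuts:     ?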