⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 score.cpp.svn-base

📁 moses开源的机器翻译系统
💻 SVN-BASE
字号:
// $Id$// vim:tabstop=2#include <sstream>#include <cstdio>#include <iostream>#include <fstream>#include <vector>#include <string>#include <stdlib.h>#include <assert.h>#include <time.h>#include "AlignmentPhrase.h"#include "tables-core.h"using namespace std;#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) {_IS.getline(_LINE, _SIZE, _DELIM); if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear();}#define LINE_MAX_LENGTH 10000class PhraseAlignment {public:  int english, foreign;  vector< vector<size_t> > alignedToE;  vector< vector<size_t> > alignedToF;    void create( char*, int );  void clear();  bool equals( const PhraseAlignment& );};class LexicalTable {public:  map< WORD_ID, map< WORD_ID, double > > ltable;  void load( char[] );};vector<string> tokenize( char [] );void processPhrasePairs( vector< PhraseAlignment > & );ofstream phraseTableFile;Vocabulary vcbE;Vocabulary vcbF;LexicalTable lexTable;PhraseTable phraseTableE;PhraseTable phraseTableF;bool inverseFlag;int main(int argc, char* argv[]) {  cerr << "PhraseScore v1.2.1, written by Philipp Koehn\n"       << "phrase scoring methods for extracted phrases\n";  time_t starttime = time(NULL);  if (argc != 4 && argc != 5) {    cerr << "syntax: phrase-score extract lex phrase-table [inverse]\n";    exit(1);  }  char* &fileNameExtract = argv[1];  char* &fileNameLex = argv[2];  char* &fileNamePhraseTable = argv[3];  inverseFlag = false;  if (argc > 4) {    inverseFlag = true;    cerr << "using inverse mode\n";  }  //  char[] fileNameExtract& = "/data/nlp/koehn/europarl-v2/models/de-en/model/new-extract.sorted";  //  string fileNameLex = "/data/nlp/koehn/europarl-v2/models/de-en/model/lex.f2n";  //  string fileNamePhraseTable = "/data/nlp/koehn/europarl-v2/models/de-en/model/new-phrase-table-half.f2n";  // lexical translation table  lexTable.load( fileNameLex );    // sorted phrase extraction file  ifstream extractFile;  extractFile.open(fileNameExtract);  if (extractFile.fail()) {    cerr << "ERROR: could not open extract file " << fileNameExtract << endl;    exit(1);  }  istream &extractFileP = extractFile;  // output file: phrase translation table  phraseTableFile.open(fileNamePhraseTable);  if (phraseTableFile.fail()) {    cerr << "ERROR: could not open file phrase table file " 	 << fileNamePhraseTable << endl;    exit(1);  }    // loop through all extracted phrase translations  int lastForeign = -1;  vector< PhraseAlignment > phrasePairsWithSameF;  int i=0;  int fileCount = 0;  while(true) {    if (extractFileP.eof()) break;    if (++i % 100000 == 0) cerr << "." << flush;    char line[LINE_MAX_LENGTH];        SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n');    //    if (fileCount>0)    if (extractFileP.eof()) break;    PhraseAlignment phrasePair;    phrasePair.create( line, i );    if (lastForeign >= 0 && lastForeign != phrasePair.foreign) {      processPhrasePairs( phrasePairsWithSameF );      for(int j=0;j<phrasePairsWithSameF.size();j++)	phrasePairsWithSameF[j].clear();      phrasePairsWithSameF.clear();      phraseTableE.clear();      phraseTableF.clear();      phrasePair.clear(); // process line again, since phrase tables flushed      phrasePair.create( line, i );     }    lastForeign = phrasePair.foreign;    phrasePairsWithSameF.push_back( phrasePair );  }  processPhrasePairs( phrasePairsWithSameF );  phraseTableFile.close();}void outputAlignment(const AlignmentPhrase &alignmentPhrase){	for (size_t posWord = 0 ; posWord < alignmentPhrase.GetSize() ; ++posWord)	{		stringstream strme("");		const AlignmentElement &alignmentElement = alignmentPhrase.GetElement(posWord);		AlignmentElement::const_iterator iterElement;		for (iterElement = alignmentElement.begin() ; iterElement != alignmentElement.end() ; ++iterElement)		{			size_t align = *iterElement;			strme << "," << align;		}		string str = strme.str();		if (str.size() > 0)			str = str.substr(1, str.size() - 1);		phraseTableFile << "(" << str << ") ";	}	phraseTableFile << "||| ";}void processPhrasePairs( vector< PhraseAlignment > &phrasePair ) {  map<int, int> countE;  map<int, int> alignmentE;  int totalCount = 0;  int currentCount = 0;  int maxSameCount = 0;  int maxSame = -1;  int old = -1;  for(int i=0;i<phrasePair.size();i++) {    if (i>0) {      if (phrasePair[old].english == phrasePair[i].english) {				if (! phrasePair[i].equals( phrasePair[old] )) {					if (currentCount > maxSameCount) {						maxSameCount = currentCount;						maxSame = i-1;					}					currentCount = 0;				}			}      else {				// wrap up old E				if (currentCount > maxSameCount) {					maxSameCount = currentCount;					maxSame = i-1;				}				alignmentE[ phrasePair[old].english ] = maxSame;				//	if (maxSameCount != totalCount)				//  cout << "max count is " << maxSameCount << "/" << totalCount << endl;								// get ready for new E				totalCount = 0;				currentCount = 0;				maxSameCount = 0;				maxSame = -1;			}    }    countE[ phrasePair[i].english ]++;    old = i;    currentCount++;    totalCount++;  }    // wrap up old E  if (currentCount > maxSameCount) {    maxSameCount = currentCount;    maxSame = phrasePair.size()-1;  }  alignmentE[ phrasePair[old].english ] = maxSame;  //  if (maxSameCount != totalCount)  //    cout << "max count is " << maxSameCount << "/" << totalCount << endl;  // output table  typedef map< int, int >::iterator II;  PHRASE phraseF = phraseTableF.getPhrase( phrasePair[0].foreign );	size_t index = 0;  for(II i = countE.begin(); i != countE.end(); i++) {    //    cout << "\tp( " << i->first << " | " << phrasePair[0].foreign << " ; " << phraseF.size() << " ) = ...\n";		//cerr << index << endl;    // foreign phrase (unless inverse)    if (! inverseFlag) {      for(int j=0;j<phraseF.size();j++)			{				//cerr << vcbF.getWord( phraseF[j] ) << " ";				phraseTableFile << vcbF.getWord( phraseF[j] );				phraseTableFile << " ";			}			//cerr << endl;      phraseTableFile << "||| ";		}    // english phrase    PHRASE phraseE = phraseTableE.getPhrase( i->first );		for(int j=0;j<phraseE.size();j++)		{			//if ( vcbE.getWord( phraseE[j] ) == "herr")			//	cerr << "";			//cerr << vcbE.getWord( phraseE[j] ) << " ";      phraseTableFile << vcbE.getWord( phraseE[j] );			phraseTableFile << " ";		}		//cerr << endl;    phraseTableFile << "||| ";    // foreign phrase (if inverse)    if (inverseFlag) {      for(int j=0;j<phraseF.size();j++)			{				//cerr << vcbF.getWord( phraseF[j] ) << " ";				phraseTableFile << vcbF.getWord( phraseF[j] );				phraseTableFile << " ";			}			//cerr << endl;      phraseTableFile << "||| ";		} 		// merge all alignments 		AlignmentPhrase alignementF(phraseF.size())										,alignementE(phraseE.size());		size_t numExamples = i->second;		for (size_t currExample = index ; currExample < index + numExamples ; ++currExample)		{			vector< vector<size_t> > &currAlignmentF	= phrasePair[currExample].alignedToF														,&currAlignmentE = phrasePair[currExample].alignedToE;			alignementF.Merge(currAlignmentF);			alignementE.Merge(currAlignmentE);		}    if (! inverseFlag) 		{			outputAlignment(alignementF);		}		outputAlignment(alignementE);    if ( inverseFlag) 		{			outputAlignment(alignementF);		}		// phrase translation probability    phraseTableFile << ((double) i->second / (double) phrasePair.size());    // lexical translation probability    double lexScore = 1;    int null = vcbF.getWordID("NULL");    PhraseAlignment &current = phrasePair[ alignmentE[ i->first ] ];    for(int ei=0;ei<phraseE.size();ei++) { // all english words have to be explained      if (current.alignedToE[ ei ].size() == 0)	lexScore *= lexTable.ltable[ null ][ phraseE[ ei ] ]; // by NULL if neccessary      else {	double thisWordScore = 0;	for(int j=0;j<current.alignedToE[ ei ].size();j++) {	  thisWordScore += lexTable.ltable[ phraseF[current.alignedToE[ ei ][ j ] ] ][ phraseE[ ei ] ];	  //	  cout << "lex" << j << "(" << vcbE.getWord( phraseE[ ei ] ) << "|" << vcbF.getWord( phraseF[current.alignedToE[ ei ][ j ] ] ) << ")=" << lexTable.ltable[ phraseF[current.alignedToE[ ei ][ j ] ] ][ phraseE[ ei ] ] << " ";	}	lexScore *= thisWordScore / (double)current.alignedToE[ ei ].size();      }      //      cout << " => " << lexScore << endl;    }    phraseTableFile << " " << lexScore;    // model 1 score    // zens&ney lexical score    phraseTableFile << endl;		index += i->second;  }}void PhraseAlignment::create( char line[], int lineID ) {  vector< string > token = tokenize( line );  int item = 1;  PHRASE phraseF, phraseE;  for (int j=0; j<token.size(); j++) {    if (token[j] == "|||") item++;    else {      if (item == 1)	phraseF.push_back( vcbF.storeIfNew( token[j] ) );      else if (item == 2)	phraseE.push_back( vcbE.storeIfNew( token[j] ) );      else if (item == 3) {	int e,f;	sscanf(token[j].c_str(), "%d-%d", &f, &e);	if (e >= phraseE.size() || f >= phraseF.size()) { 	  cerr << "WARNING: sentence " << lineID << " has alignment point (" << f << ", " << e << ") out of bounds (" << phraseF.size() << ", " << phraseE.size() << ")\n"; }	else {	  if (alignedToE.size() == 0) {	    vector< size_t > dummy;	    for(int i=0;i<phraseE.size();i++)	      alignedToE.push_back( dummy );	    for(int i=0;i<phraseF.size();i++)	      alignedToF.push_back( dummy );	    foreign = phraseTableF.storeIfNew( phraseF );	    english = phraseTableE.storeIfNew( phraseE );	  }	  alignedToE[e].push_back( f );	  alignedToF[f].push_back( e );	}      }    }  }}void PhraseAlignment::clear() {  for(int i=0;i<alignedToE.size();i++)    alignedToE[i].clear();  for(int i=0;i<alignedToF.size();i++)    alignedToF[i].clear();  alignedToE.clear();  alignedToF.clear();}bool PhraseAlignment::equals( const PhraseAlignment& other ) {  if (this == &other) return true;  if (other.english != english) return false;  if (other.foreign != foreign) return false;  PHRASE phraseE = phraseTableE.getPhrase( english );  PHRASE phraseF = phraseTableF.getPhrase( foreign );  for(int i=0;i<phraseE.size();i++) {    if (alignedToE[i].size() != other.alignedToE[i].size()) return false;    for(int j=0; j<alignedToE[i].size(); j++) {      if (alignedToE[i][j] != other.alignedToE[i][j]) return false;    }  }  for(int i=0;i<phraseF.size();i++) {    if (alignedToF[i].size() != other.alignedToF[i].size()) return false;    for(int j=0; j<alignedToF[i].size(); j++) {      if (alignedToF[i][j] != other.alignedToF[i][j]) return false;    }  }  return true;}void LexicalTable::load( char *fileName ) {  cerr << "Loading lexical translation table from " << fileName;  ifstream inFile;  inFile.open(fileName);  if (inFile.fail()) {    cerr << " - ERROR: could not open file\n";    exit(1);  }  istream *inFileP = &inFile;  char line[LINE_MAX_LENGTH];  int i=0;  while(true) {    i++;    if (i%100000 == 0) cerr << "." << flush;    SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n');    if (inFileP->eof()) break;    vector<string> token = tokenize( line );    if (token.size() != 3) {      cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" <<	token.size() << " " << token[0] << " " << line << endl;      continue;    }        double prob = atof( token[2].c_str() );    WORD_ID wordE = vcbE.storeIfNew( token[0] );    WORD_ID wordF = vcbF.storeIfNew( token[1] );    ltable[ wordF ][ wordE ] = prob;  }  cerr << endl;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -