translationoptioncollection.cpp.svn-base
来自「moses开源的机器翻译系统」· SVN-BASE 代码 · 共 593 行 · 第 1/2 页
SVN-BASE
593 行
// $Id$// vim:tabstop=2/***********************************************************************Moses - factored phrase-based language decoderCopyright (C) 2006 University of EdinburghThis library is free software; you can redistribute it and/ormodify it under the terms of the GNU Lesser General PublicLicense as published by the Free Software Foundation; eitherversion 2.1 of the License, or (at your option) any later version.This library is distributed in the hope that it will be useful,but WITHOUT ANY WARRANTY; without even the implied warranty ofMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNULesser General Public License for more details.You should have received a copy of the GNU Lesser General PublicLicense along with this library; if not, write to the Free SoftwareFoundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA***********************************************************************/#include <algorithm>#include "TranslationOptionCollection.h"#include "Sentence.h"#include "DecodeStep.h"#include "LanguageModel.h"#include "PhraseDictionaryMemory.h"#include "FactorCollection.h"#include "InputType.h"#include "Util.h"#include "StaticData.h"#include "DecodeStepTranslation.h"#include "DecodeGraph.h"using namespace std;/** constructor; since translation options are indexed by coverage span, the corresponding data structure is initialized here * This fn should be called by inherited classes*/TranslationOptionCollection::TranslationOptionCollection(InputType const& src, size_t maxNoTransOptPerCoverage) : m_source(src) ,m_futureScore(src.GetSize()) ,m_maxNoTransOptPerCoverage(maxNoTransOptPerCoverage){ // create 2-d vector size_t size = src.GetSize(); for (size_t startPos = 0 ; startPos < size ; ++startPos) { m_collection.push_back( vector< TranslationOptionList >() ); size_t maxSize = size - startPos + 1; size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength(); maxSize = (maxSize < maxSizePhrase) ? maxSize : maxSizePhrase; for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) { m_collection[startPos].push_back( TranslationOptionList() ); } }}/** destructor, clears out data structures */TranslationOptionCollection::~TranslationOptionCollection(){ size_t size = m_source.GetSize(); for (size_t startPos = 0 ; startPos < size ; ++startPos) { size_t maxSize = size - startPos + 1; size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength(); maxSize = (maxSize < maxSizePhrase) ? maxSize : maxSizePhrase; for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) { RemoveAllInColl(GetTranslationOptionList(startPos, endPos)); } } RemoveAllInColl(m_unksrcs);}/** helper for pruning */bool CompareTranslationOption(const TranslationOption *a, const TranslationOption *b){ return a->GetFutureScore() > b->GetFutureScore();}void TranslationOptionCollection::Prune(){ size_t size = m_source.GetSize(); // prune to max no. of trans opt if (m_maxNoTransOptPerCoverage == 0) return; size_t total = 0; size_t totalPruned = 0; for (size_t startPos = 0 ; startPos < size; ++startPos) { size_t maxSize = size - startPos; size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength(); maxSize = (maxSize < maxSizePhrase) ? maxSize : maxSizePhrase; for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) { TranslationOptionList &fullList = GetTranslationOptionList(startPos, endPos); total += fullList.size(); if (fullList.size() <= m_maxNoTransOptPerCoverage) continue; // sort in vector nth_element(fullList.begin(), fullList.begin() + m_maxNoTransOptPerCoverage, fullList.end(), CompareTranslationOption); totalPruned += fullList.size() - m_maxNoTransOptPerCoverage; // delete the rest for (size_t i = m_maxNoTransOptPerCoverage ; i < fullList.size() ; ++i) { delete fullList[i]; } fullList.resize(m_maxNoTransOptPerCoverage); } } VERBOSE(2," Total translation options: " << total << std::endl << "Total translation options pruned: " << totalPruned << std::endl);}/** Force a creation of a translation option where there are none for a particular source position.* ie. where a source word has not been translated, create a translation option by* 1. not observing the table limits on phrase/generation tables* 2. using the handler ProcessUnknownWord()* Call this function once translation option collection has been filled with translation options** This function calls for unknown words is complicated by the fact it must handle different input types. * The call stack is* Base::ProcessUnknownWord()* Inherited::ProcessUnknownWord(position)* Base::ProcessOneUnknownWord()** \param decodeStepList list of decoding steps* \param factorCollection input sentence with all factors*/void TranslationOptionCollection::ProcessUnknownWord(const std::vector <DecodeGraph*> &decodeStepVL){ size_t size = m_source.GetSize(); // try to translation for coverage with no trans by expanding table limit for (size_t startVL = 0 ; startVL < decodeStepVL.size() ; startVL++) { const DecodeGraph &decodeStepList = *decodeStepVL[startVL]; for (size_t pos = 0 ; pos < size ; ++pos) { TranslationOptionList &fullList = GetTranslationOptionList(pos, pos); size_t numTransOpt = fullList.size(); if (numTransOpt == 0) { CreateTranslationOptionsForRange(decodeStepList , pos, pos, false); } } } bool alwaysCreateDirectTranslationOption = StaticData::Instance().IsAlwaysCreateDirectTranslationOption(); // create unknown words for 1 word coverage where we don't have any trans options for (size_t pos = 0 ; pos < size ; ++pos) { TranslationOptionList &fullList = GetTranslationOptionList(pos, pos); if (fullList.size() == 0 || alwaysCreateDirectTranslationOption) ProcessUnknownWord(pos); }}/** special handling of ONE unknown words. Either add temporarily add word to translation table, * or drop the translation. * This function should be called by the ProcessOneUnknownWord() in the inherited class * At the moment, this unknown word handler is a bit of a hack, if copies over each factor from source * to target word, or uses the 'UNK' factor. * Ideally, this function should be in a class which can be expanded upon, for example, * to create a morphologically aware handler. * * \param sourceWord the unknown word * \param sourcePos * \param factorCollection input sentence with all factors */void TranslationOptionCollection::ProcessOneUnknownWord(const Word &sourceWord, size_t sourcePos, size_t length){ // unknown word, add as trans opt FactorCollection &factorCollection = FactorCollection::Instance(); size_t isDigit = 0; if (StaticData::Instance().GetDropUnknown()) { const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface const string &s = f->GetString(); isDigit = s.find_first_of("0123456789"); if (isDigit == string::npos) isDigit = 0; else isDigit = 1; // modify the starting bitmap } Phrase* m_unksrc = new Phrase(Input); m_unksrc->AddWord() = sourceWord; m_unksrcs.push_back(m_unksrc); TranslationOption *transOpt; if (! StaticData::Instance().GetDropUnknown() || isDigit) { // add to dictionary TargetPhrase targetPhrase(Output); Word &targetWord = targetPhrase.AddWord(); for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) { FactorType factorType = static_cast<FactorType>(currFactor); const Factor *sourceFactor = sourceWord[currFactor]; if (sourceFactor == NULL) targetWord[factorType] = factorCollection.AddFactor(Output, factorType, UNKNOWN_FACTOR); else targetWord[factorType] = factorCollection.AddFactor(Output, factorType, sourceFactor->GetString()); } targetPhrase.SetScore(); targetPhrase.SetSourcePhrase(m_unksrc); transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos + length - 1), targetPhrase, m_source, 0); } else { // drop source word. create blank trans opt TargetPhrase targetPhrase(Output); targetPhrase.SetSourcePhrase(m_unksrc); transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos + length - 1), targetPhrase, m_source, 0); } transOpt->CalcScore(); Add(transOpt);}/** compute future score matrix in a dynamic programming fashion. * This matrix used in search. * Call this function once translation option collection has been filled with translation options*/void TranslationOptionCollection::CalcFutureScore(){ // setup the matrix (ignore lower triangle, set upper triangle to -inf size_t size = m_source.GetSize(); // the width of the matrix for(size_t row=0; row<size; row++) { for(size_t col=row; col<size; col++) { m_futureScore.SetScore(row, col, -numeric_limits<float>::infinity()); } } // walk all the translation options and record the cheapest option for each span for (size_t startPos = 0 ; startPos < m_source.GetSize() ; ++startPos) { size_t maxSize = m_source.GetSize() - startPos; size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength(); maxSize = (maxSize < maxSizePhrase) ? maxSize : maxSizePhrase; for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) { TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos); TranslationOptionList::const_iterator iterTransOpt; for(iterTransOpt = transOptList.begin() ; iterTransOpt != transOptList.end() ; ++iterTransOpt) { const TranslationOption &transOpt = **iterTransOpt; float score = transOpt.GetFutureScore(); if (score > m_futureScore.GetScore(startPos, endPos)) m_futureScore.SetScore(startPos, endPos, score); } } } // now fill all the cells in the strictly upper triangle // there is no way to modify the diagonal now, in the case // where no translation option covers a single-word span, // we leave the +inf in the matrix // like in chart parsing we want each cell to contain the highest score // of the full-span trOpt or the sum of scores of joining two smaller spans for(size_t colstart = 1; colstart < size ; colstart++) { for(size_t diagshift = 0; diagshift < size-colstart ; diagshift++) { size_t startPos = diagshift; size_t endPos = colstart+diagshift; for(size_t joinAt = startPos; joinAt < endPos ; joinAt++) { float joinedScore = m_futureScore.GetScore(startPos, joinAt) + m_futureScore.GetScore(joinAt+1, endPos); /* // uncomment to see the cell filling scheme TRACE_ERR( "[" <<startPos<<","<<endPos<<"] <-? ["<<startPos<<","<<joinAt<<"]+["<<joinAt+1<<","<<endPos << "] (colstart: "<<colstart<<", diagshift: "<<diagshift<<")"<<endl); */ if (joinedScore > m_futureScore.GetScore(startPos, endPos)) m_futureScore.SetScore(startPos, endPos, joinedScore); } }
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?