📄 trans_06.cc
字号:
// long current_tran_token = 0; String atype_00; // tokenize the transcription and generate the graph // long pos = 0; // long syntac_num = 0; // long current_syn = 0; String syn_string; long skip_token = 0; // while (transcription.tokenize(atype_00, pos)) { atype_00.assign(transcription); Long x = transcription.firstStr(L":", 0); x.debug(L"x="); transcription.deleteRange(0, (long)x+1); transcription.trim(); transcription.debug(L"TR"); atype_00.deleteRange(x, atype_00.length()-x); atype_00.debug(L"atype"); // get one token // atype_00.trim(); String time_00; while (atype_00.tokenize(time_00, pos)) { skip_token++; if (skip_token ==1) offset_00.assign(time_00); else if (skip_token == 2) offset_01.assign(time_00); else if (skip_token == 3) channel_00.assign(time_00); } long channel_x = 0; if ( channel_00.eq(L"A")) { channel_00.debug(L"channel A:"); channel_x = 0; } else if (channel_00.eq(L"B")) { channel_00.debug(L"channel B:"); channel_x = 1; } if (debug_level_d >= Integral::DETAILED) { atype_00.debug(L"word:"); } // insert the word to AG // newid_00 = angr_00.createAnchor(name_00, offset_00, unit_00); newid_01 = angr_00.createAnchor(name_00, offset_01, unit_00); ancr_00 = angr_00.getAnchorById(newid_00); ancr_01 = angr_00.getAnchorById(newid_01); newid_02 = angr_00.createAnnotation(name_00, ancr_00, ancr_01, transcription, channel_x); if (!angr_00.setFeature(newid_02, feat_00, value_00)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } } // end of transcription tokenize // test the insert method // if (!insertRecord(trans_file, angr_00)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } angr_00.clear(); // move one forward for file count // num_file++; if (debug_level_d >= Integral::DETAILED) { Long(num_file).debug(L"total number of file processed="); trans_file.debug(L"file name"); transcription.debug(L"transcription"); } // debug(L"upto now"); } while (sdb_a.gotoNext()); return true;}// method: load//// arguments:// Sdb& sdb: (input) sdb id list// Filename& trans_file: (input) transcription file // Filename& lexicon_file: (input) lexicon file// boolean flag: (input) flag to indicate if the time information exist//// return: logical error status//// this method load data files to the transcription database//boolean TranscriptionDatabase::load(Sdb& sdb_a, Filename& trans_file_a, Filename& lexicon_file_a, boolean flag_a) { // loop from start // if (!sdb_a.gotoFirst()) { String msg(L"Error: no input file specified "); Console::put(msg); Error::handle(name(), L"load", Error::NO_PARAM_FILE, __FILE__, __LINE__); } Filename trans_file; Sof transcription_file; String transcription; long num_file = 0; // declare a string vector to store the transcription information // Vector<String> trans_vec; // open the input file in read mode // File read_trans_file; if (!read_trans_file.open(trans_file_a, File::READ_ONLY)) { Console::put(L"Error in opening transcription input file"); } // read the string lines // String input_line_01; while (read_trans_file.get(input_line_01)) { trans_vec.concat(input_line_01); } // close the input text file // read_trans_file.close(); long length_01 = trans_vec.length(); Long(length_01).debug(L"total lines in transcription = "); // declare the hashtable for the word and its pronunciation // HashTable<String, String> pronun_map_d; // open the input file in read lexicon // Vector<String> lexicon_symbol_list; // open the input file in read mode // File read_lexicon_file; if (!read_lexicon_file.open(lexicon_file_a, File::READ_ONLY)) { Console::put(L"Error in opening lexicon input file"); } // declare variables // String str; Vector<String> nonsp_def, pre_list, word_list, rule_list; // read each line // while (read_lexicon_file.get(str)) { str.debug(L"str"); // pre-process the input lexicon lines and merge the same lines // boolean same = false; for (long i = 0; i < pre_list.length(); i++) { if (str.eq(pre_list(i))) same = true; } if (!same) { pre_list.concat(str); } } read_lexicon_file.close(); // process each lexicon line after pre-processing in the pre_list // for (long i = 0; i < pre_list.length(); i++) { String head_word, symbol, sequence; long pos(0); String delim(L" "); String lex_str(pre_list(i)); // get the first word in the lexicon line // lex_str.tokenize(head_word, pos); String key_word = head_word; long alt_index = 0; while (pronun_map_d.containsKey(key_word)) { key_word.assign(head_word); key_word.concat(L"."); key_word.concat(alt_index++); } String rest_string; lex_str.tokenize(rest_string, pos, lex_str.length() - pos); rest_string.trim(); rest_string.debug(L"lexicon"); pronun_map_d.insert(key_word, &rest_string); } if (debug_level_d >= Integral::DETAILED) { pronun_map_d.debug(L"lexicon"); } String name_00(L"SPINE"); // create the annotation graph // String gtype_00(L"ORTHOGRAPHIC"); String ident_00(L"id_00"); String ident_01(L"id_01"); String ident_02(L"id_02"); String ident_03(L"id_03"); String ident_04(L"id_04"); String newid_00; String newid_01; String newid_02; String newid_03; String newid_04; String synid_00; String synid_01; Float offset_00(0.0); Float offset_01(0.0); Float offset_02(0.0); Anchor* ancr_00 = (Anchor*)NULL; Anchor* ancr_01 = (Anchor*)NULL; String unit_00(L"seconds"); String feat_00(L"level"); String value_00(L"syntactic"); String value_01(L"word"); String value_02(L"phoneme"); String channel_00; setDataBaseName(name_00); do { sdb_a.getName(trans_file); AnnotationGraph angr_00(name_00, gtype_00); // get the transcription // String transcription = trans_vec(num_file); // pre-processing transcription // String atype_00; atype_00.assign(transcription); long pos = 0; long skip_token = 0; long channel_x = 0; if (flag_a) { long x = transcription.firstStr(L":", 0); Long(x).debug(L"':' position ="); transcription.deleteRange(0, (long)x+1); transcription.trim(); transcription.debug(L"transcription"); atype_00.deleteRange(x, atype_00.length()-x); atype_00.debug(L"atype"); atype_00.trim(); // tokenize the transcription and generate the graph // String time_00; while (atype_00.tokenize(time_00, pos)) { skip_token++; if (skip_token ==1) offset_00.assign(time_00); else if (skip_token == 2) offset_01.assign(time_00); else if (skip_token == 3) channel_00.assign(time_00); } if ( channel_00.eq(L"A")) { channel_00.debug(L"channel A:"); channel_x = 0; } else if (channel_00.eq(L"B")) { channel_00.debug(L"channel B:"); channel_x = 1; } if (debug_level_d >= Integral::DETAILED) { atype_00.debug(L"word:"); } } // end of flag_a long trans_token = transcription.countTokens(); long current_tran_token = 0; long token_count = 0; if (flag_a) { newid_00 = angr_00.createAnchor(name_00, offset_00, unit_00); } else { newid_00 = angr_00.createAnchor(name_00, unit_00); } synid_00 = newid_00; pos = 0; while (transcription.tokenize(atype_00, pos)) { // get one token // atype_00.trim(); token_count++; atype_00.debug(L"word:"); if (debug_level_d >= Integral::DETAILED) { atype_00.debug(L"word:"); } // insert the word to AG // if (token_count == trans_token) { if (flag_a) { newid_01 = angr_00.createAnchor(name_00, offset_01, unit_00); } else { newid_01 = angr_00.createAnchor(name_00, unit_00); } } else { newid_01 = angr_00.createAnchor(name_00, unit_00); } ancr_00 = angr_00.getAnchorById(newid_00); ancr_01 = angr_00.getAnchorById(newid_01); ancr_00->debug(L"new_00"); ancr_01->debug(L"new_01"); newid_02 = angr_00.createAnnotation(name_00, ancr_00, ancr_01, atype_00, channel_x); ancr_00->debug(L"new_00A"); ancr_01->debug(L"new_01A"); if (!angr_00.setFeature(newid_02, feat_00, value_01)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } ancr_00->debug(L"new_00AA"); ancr_01->debug(L"new_01AA"); // add phone level AG here // String key_phone = atype_00; String delim(L" "); long alt_index = 0; while (pronun_map_d.containsKey(key_phone)) { long pos(0); String symbol, sub_symbol; symbol.assign(*pronun_map_d.get(key_phone)); if (debug_level_d >= Integral::DETAILED) { symbol.debug(L"sub_symbol--------------------"); } long total_token = symbol.countTokens(); long token_number = 0; newid_03 = newid_00; while (symbol.tokenize(sub_symbol, pos, delim)) { if (token_number == total_token - 1) { newid_04 = newid_01; } else { newid_04 = angr_00.createAnchor(name_00, unit_00); } ancr_00 = angr_00.getAnchorById(newid_03); ancr_01 = angr_00.getAnchorById(newid_04); ancr_00->debug(L"new_00B"); ancr_01->debug(L"new_01B"); newid_02 = angr_00.createAnnotation(name_00, ancr_00, ancr_01, sub_symbol); ancr_00->debug(L"new_00BB"); ancr_01->debug(L"new_01BB"); if (!angr_00.setFeature(newid_02, feat_00, value_02)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } ancr_00->debug(L"new_00BBB"); ancr_01->debug(L"new_01BBB"); newid_03 = newid_04; token_number++; } // end of while tokenize key_phone.assign(atype_00); key_phone.concat(L"."); key_phone.concat(alt_index++); if (debug_level_d >= Integral::DETAILED) { key_phone.debug(L"key_phone=============="); } } // end of while containsKey newid_00 = newid_01; current_tran_token++; if (current_tran_token == trans_token) { synid_01 = newid_01; ancr_00 = angr_00.getAnchorById(synid_00); ancr_01 = angr_00.getAnchorById(synid_01); //syn_string.trim(); ancr_00->debug(L"new_00C"); ancr_01->debug(L"new_01C"); newid_02 = angr_00.createAnnotation(name_00, ancr_00, ancr_01, transcription, channel_x); ancr_00->debug(L"new_00D"); ancr_01->debug(L"new_01D"); if (!angr_00.setFeature(newid_02, feat_00, value_00)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } // syn_string.clear(); ancr_00->debug(L"new_00E"); ancr_01->debug(L"new_01E"); synid_00 = newid_00; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -