📄 ctc_trellis_encoder.asm
字号:
// NOTE(review): this section had lost its physical line breaks, so every `//`
// comment swallowed the instructions that followed it. The line structure is
// restored below (one instruction line, terminated by `;;`, per physical line);
// no instruction has been altered. The indentation inside _loop_64_bits is
// reconstructed from the accumulator convention described in the loop header
// -- TODO confirm against a pristine copy of the file.

// Last instruction line of the setup code that precedes this section (the rest
// of the setup is not visible here). xr29 and r28 are consumed again by the
// circulation-state code further down (as the "N mod 7" result and as a shift
// amount, per the comments there); k6 is loaded from k8 + k31.
xr29 = r29 - r21; xr28 = yr28; k6 = k8+k31;;

///////////////////////////////////////////////////////////////////
// Main loop
//
// Data goes through this main loop two times:
//
// 1) In the first two columns, implement H^31
//    The three pieces described each have their own accumulator: r13:12, r15:14, and r17:16
//    The computation instructions for the second accumulator are indented by 1 space,
//    the computation instructions for the third accumulator are indented by 2 spaces.
//
// 2) a) Compute the output of H^31/H^32 in r19:18
//       The upper 32 bits, in r19, are the sum of the 3 accumulators.
//       The lower 32 bits, in r18, are taken from the previous iteration's r19.
//
//    b) In r27:26, compute the next state of the expanded polynomial H^31/H^32
//       Note that the state only depends on the denominator, 1 + D^32 + D^96.
//       Since we process 32 bits on each iteration (per A and B stream), the operation is
//       - For the new 32 bits, add input, old r3, and old r1
//       - Shift old state (old r3:0) 32 bits right and shift in new 32 bits.
//
//    c) In r25:20, compute the outputs Y and W, as well as the state of the
//       original encoder, s1, s2, and s3.
//       Note that Y and W here are temporary outputs, since we haven't yet added
//       the circulation state output.
//       The outputs are sent to the k6 and k7 buffers. Since data must go through
//       the main loop twice, the first output is dummy. The first time through,
//       the output pointers k6 and k7 do not increment, therefore this dummy output
//       is overwritten.
//
// Cycle count: 27 cycles/iteration (consumes 64 input bits)
///////////////////////////////////////////////////////////////////
// Below: yr19 = S3(b); xr19 = S1(b)
// Below: yr18 = 1+D^2 = W(b)=S2(b); xr22 = 1+D^3 = W(a)
// Below: yr24 = 1+D+D^2 = S1(b); xr24 = 1+D^2+D^3 = Y(a)
// Below: yr22 = 1+D+D^2+D^3 = Y(b)
// Below: yr22 = Y(.); xr22 = W(.)
.align_code 8;
_loop_64_bits:
lr9:8 = lshift r3:2 by 1; xr26 = [J8 += 1]; r15 = r15 xor r5;;          // [65 through 93]
lr7:6 = lshift r3:2 by 2; yr26 = [J9 += 1]; r17 = r17 xor r7;;          // [32 through 63]
lr9:8 = lshift r3:2 by 4; lr13:12 = r3:2 xor r9:8;;                     // lr13:12 contains delays [0 1]
 lr11:10 = lshift r1:0 by 4; r21 = r11 xor r15; r18 = r19;;
 lr15:14 = lshift r1:0 by 5; lr13:12 = r13:12 xor r7:6;;                // [0 1 2]
  r17 = lshift r1 by -1; r19 = r21 xor r17;;
  r5 = lshift r1 by -2; r27 = r1 xor r3;;
lr7:6 = lshift r13:12 by 28; lr13:12 = r13:12 xor r9:8;;                // [0 1 2 4]
 lr11:10 = lshift r1:0 by 6; lr15:14 = r15:14 xor r11:10;;              // [68 69]
  r5 = lshift r1 by -3; r17 = r17 xor r5;;                              // [62 63]
lr9:8 = lshift r13:12 by 7; r27 = r26 xor r27;;
 lr11:10 = lshift r1:0 by 8; lr15:14 = r15:14 xor r11:10;;              // [68 69 70]
 lr13:12 = r13:12 xor r9:8; lr21:20 = lshift r19:18 by 3;;              // [0 1 2 4 7 8 9 11]
  r5 = lshift r1 by -6; r17 = r17 xor r5;;                              // [61 62 63]
lr9:8 = lshift r13:12 by 14; lr13:12 = r13:12 xor r7:6;;                // [0 through 11 and 28 29 30]
 lr15:14 = r15:14 xor r11:10; lr23:22 = lshift r19:18 by 2;;            // [68 69 70 72]
  r17 = r17 xor r5; lr25:24 = lshift r19:18 by 1;;                      // [58 61 62 63]
 lr5:4 = lshift r15:14 by 7; yr18 = r19 xor r23; xr22 = r19 xor r21;;
  r7 = lshift r17 by -7; lr13:12 = r13:12 xor r9:8; xr20 = yr18;;       // [0 through 30]
 lr11:10 = lshift r1:0 by 1; yr24 = r18 xor r25; xr24 = r22 xor r23;;
lr9:8 = lshift r13:12 by 33; lr15:14 = r15:14 xor r5:4; yr20 = xr24;;   // [68 69 70 72 75 76 77 79]
 yr22 = r24 xor r21; r0 = lshift r1 by 0; r1 = r2;;
  r7 = lshift r17 by -28; r17 = r17 xor r7;;                            // [51 54 55 56 58 61 62 63]
 r22 = r22 xor r20; r2 = lshift r3 by 0; r3 = r27;;
 lr5:4 = lshift r15:14 by 14; r15 = r15 xor r11; [k7+=K4] = xr22;;      // [65 68 69 70 72 75 76 77 79]
  r7 = lshift r17 by -14; r17 = r17 xor r7; [k6+=K4] = yr22;;           // [51 through 63 and 32 through 35]
// k4 is the post-modify step of the two output stores above; it is set to
// k31+1 here, at the end of every iteration, and reset to k31+k31 at _end_y_w.
if nlc0e, jump _loop_64_bits; r11 = r13 xor r9; k4 = k31+1;;            // [0 through 63]

///////////////////////////////////////////////////////////////////
// Circulation state computation
//
// We first need to compute the final state of the original encoder
// (i.e., the one defined in the spec).
// This is defined as 4*s1 + 2*s2 + s3.
//
// - For the A computation, [s1 s2 s3 -] are in 4 consecutive bits in xr19.
//   We just need to shift r19 by the value computed above in r28 and
//   keep only the lowest 3 bits (AND with 7).
//
// - For the B computation, s1, s2, and s3 have been computed separately in
//   yr24, yr18, and yr19. We shift these registers by r28, isolate one bit
//   of each (AND with 4), move the s2 and s3 bits down by 1 and 2 positions
//   respectively, and add.
//
// Also, take the N mod 7 result in xr29 and compute
//   8 * (( N mod 7)-1)
// as the offset to the circulation state table.
///////////////////////////////////////////////////////////////////
yr15:14 = lshift r19:18 by r28; yr26 = 4; xr15 = lshift r19 by r28;;
yr21 = lshift r24 by r28; xr14 = 7;;
yr14 = r14 and r26;;
yr15 = r15 and r26; xr15 = r15 and r14;;
yr21 = r21 and r26; yr14 = lshift r14 by -1; yr22 = xr15;;
yr15 = lshift r15 by -2; xr20 = dec r29;;
// The next physical line holds two separate instruction lines, as in the original.
yr14 = r14 + r21;; xr20 = lshift r20 by 3;;
yr15 = r15 + r14; yr24 = xr20;;
// stall
// Combine A and B final states
yr15 = r15 xor r22;;
// If circulation state is 0, we are already done.
// If not 0, need to encode circulation state output for Y and W.
if yale, jump _end_y_w (NP);;

///////////////////////////////////////////////////////////////////
// Add in circulation state component
//
// Do table lookup to get index into the buffer that has the precomputed
// trellis output with zero input starting from circulation state.
// The precomputed buffer is accessed as a circular buffer of length
// 28. (The pattern repeats every 7 bits, but we also want an integer
// number of quadwords to facilitate the use of the DAB. There is no
// need for separate Y and W buffers since the zero-input results
// are shifts of each other.)
//
// In the second column, compute the number of loop iterations.
// We do 128 bits per loop iteration
// Number of Y bits is N, equal to
//   N = num_input_bytes * 8/2 = num_input_bytes*4
// Therefore, loop iterations = (num_input_bytes*4 + 127)/128
// (the code below computes this from r30 as ((r30 << 2) + 127) >> 7;
//  presumably r30 holds num_input_bytes -- TODO confirm against the setup code)
///////////////////////////////////////////////////////////////////
yr16 = r15 + r24; j12 = _circ_state_table;;
j13 = yr16;;
// 2 stalls
jl0 = 28; xr20 = lshift r30 by 2;;
jb0 = _circ_state_output;;
j14 = [j12 + j13]; xr21 = 127;;
k6 = k8+k31;;
k7 = k9+k31;;
// 2 stalls
j0 = j14 + _circ_state_output;;
j1 = j14 + _circ_state_output+6; xr20 = r20 + r21;;
jl1 = 28;;
jb1 = _circ_state_output; xr20 = lshift r20 by -7;;
// Same iteration count (xr20) is loaded into lc0 for the Y loop and,
// a few lines below, into lc1 for the W loop.
lc0 = xr20;;
r3:0 = q[k6 += k31];;
// Initial dummy DAB read
r7:4 = dab q[j0 += 4];;
r7:4 = dab q[j0 += 4];;
lc1 = xr20;;
.align_code 4;
_circ_y_loop_128_bits:
// XOR one quadword of the k6 output buffer with one quadword of the
// precomputed circulation-state output, and write it back in place.
xlr9:8 = r1:0 xor r5:4; ylr9:8 = r3:2 xor r7:6; r7:4 = dab q[j0 += 4]; r3:0 = q[k6 + 4];;
if nlc0e, jump _circ_y_loop_128_bits; q[k6 += 4] = yxr9:8;;
// Do the same for W
r3:0 = q[k7 += k31];;
r7:4 = dab q[j1 += 4];;
r7:4 = dab q[j1 += 4];;
// 1 stall
.align_code 4;
_circ_w_loop_128_bits:
xlr9:8 = r1:0 xor r5:4; ylr9:8 = r3:2 xor r7:6; r7:4 = dab q[j1 += 4]; r3:0 = q[k7 + 4];;
if nlc1e, jump _circ_w_loop_128_bits; q[k7 += 4] = yxr9:8;;
.align_code 4;
_end_y_w:
// If this is the first time here, set up to run again, this time on the interleaved data.
// If this is the second time, the computations below are harmless and we go on to the epilogue.
// r19:18 is zeroed by subtracting it from itself; r3:0, r7:4 and r15:12 are then
// loaded from r19:16. (The 16-bit short shifts by 16 on r17:16 and r11 presumably
// clear those registers as well -- TODO confirm.)
k4 = k31 + k31; j8 = j10 + j31; r19:18 = r19:18 - r19:18; sr17:16 = lshift r17:16 by 16;;
r3:0 = r19:16; j9 = j11 + j31;;
r7:4 = r19:16; sr11 = lshift r11 by 16; xr3 = [j8 += 1];;
r15:12 = r19:16; yr3 = [j9 += 1];;
k7:6 = k11:10; k5 = k5 + 1;;
if keq, jump _loop_64_bits; lc0 = xr31; k9:8 = k11:10;;

//-----------------EPILOGUE:--------------------------------//
// Reloads xr/yr 27:24 and 31:28 plus j27:24 / k27:24 from memory addressed via
// j27/k27 and j26/k26, and returns through cjmp. (Presumably these are the
// registers saved by the prologue, which is not visible here -- TODO confirm.)
  yr27:24 = q[k27 + 16]; xr27:24 = q[j27 + 24];;
  yr31:28 = q[k27 + 12]; xr31:28 = q[j27 + 20];;
  cjmp (ABS); j27:24=q[j26+68]; k27:24=q[k26+68]; nop;;
//---------------------------------------------------------//
_ctc_trellis_encoder.end:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -