📄 ctc_trellis_encoder.asm

📁 移动通讯PHY设计中用到的数据块的CTC编译模块
💻 ASM
📖 第 1 页 / 共 2 页
字号:
上一页 12
xr29 = r29 - r21;            xr28 = yr28;                k6 = k8+k31;;

/////////////////////////////////////////////////////////////////
// Main loop
//
// Data goes through this main loop two times:
//
// 1) In the first two columns, implement H^31
//    The three pieces described each have their own accumulator:  r13:12, r15:14, and r17:16
//    The computation instructions for the second accumulator are indented by 1 space,
//    the computation instructions for the third accumulator are indented by 2 spaces.
//
// 2) a) Compute the output of H^31/H^32 in r19:18
//       The upper 32 bits, in r19, are the sum of the 3 accumulators.
//       The lower 32 bits, in r18, are taken from the previous iteration's r19.
//
//    b) In r27:26, compute the next state of the expanded polynomial H^31/H^32
//       Note that the state only depends on the denominator, 1 + D^32 + D^96.
//       Since we process 32 bits on each iteration (per A and B stream), the operation is
//          - For the new 32 bits, add input, old r3, and old r1
//          - Shift old state (old r3:0) 32 bits right and shift in new 32 bits.
//
//    c) In r25:20, compute the outputs Y and W, as well as the state of the
//       original encoder, s1, s2, and s3.
//       Note that Y and W here are temporary outputs, since we haven't yet added
//       the circulation state output.
//       The outputs are sent to the k6 and k7 buffers.  Since data must go through
//       the main loop twice, the first output is dummy.  The first time through,
//       the output pointers k6 and k7 do not increment, therefore this dummy output
//       is overwritten.
//
// Cycle count: 27 cycles/iteration  (consumes 64 input bits)
/////////////////////////////////////////////////////////////////
// Below:  yr19 = S3(b);                xr19 = S1(b)
// Below:  yr18 = 1+D^2 = W(b)=S2(b);   xr22 = 1+D^3 = W(a)
// Below:  yr24 = 1+D+D^2 = S1(b);      xr24 = 1+D^2+D^3 = Y(a)
// Below:  yr22 = 1+D+D^2+D^3 = Y(b)
// Below:  yr22 = Y(.); xr22 = W(.)
.align_code 8;
_loop_64_bits:
lr9:8 = lshift r3:2 by 1;          xr26 = [J8 += 1];              r15 = r15 xor r5;;   // [65 through 93]
lr7:6 = lshift r3:2 by 2;          yr26 = [J9 += 1];              r17 = r17 xor r7;;   // [32 through 63]
lr9:8 = lshift r3:2 by 4;       lr13:12 = r3:2 xor r9:8;;         // lr13:12 contains delays [0 1]
 lr11:10 = lshift r1:0 by 4;                                      r21 = r11 xor r15;  r18 = r19;;
 lr15:14 = lshift r1:0 by 5;    lr13:12 = r13:12 xor r7:6;;       // [0 1 2]
  r17 = lshift r1 by -1;                                          r19 = r21 xor r17;;
  r5 = lshift r1 by -2;                                           r27 = r1 xor r3;;
lr7:6 = lshift r13:12 by 28;    lr13:12 = r13:12 xor r9:8;;       // [0 1 2 4]
 lr11:10 = lshift r1:0 by 6;     lr15:14 = r15:14 xor r11:10;;    // [68 69]
  r5 = lshift r1 by -3;           r17 = r17 xor r5;;              // [62 63]
lr9:8 = lshift r13:12 by 7;                                       r27 = r26 xor r27;;
 lr11:10 = lshift r1:0 by 8;     lr15:14 = r15:14 xor r11:10;;    // [68 69 70]
                                lr13:12 = r13:12 xor r9:8;        lr21:20 = lshift r19:18 by 3;; // [0 1 2 4 7 8 9 11]
  r5 = lshift r1 by -6;           r17 = r17 xor r5;;              // [61 62 63]
lr9:8 = lshift r13:12 by 14;    lr13:12 = r13:12 xor r7:6;;       // [0 through 11 and 28 29 30]
                                 lr15:14 = r15:14 xor r11:10;     lr23:22 = lshift r19:18 by 2;; // [68 69 70 72]
                                  r17 = r17 xor r5;               lr25:24 = lshift r19:18 by 1;; // [58 61 62 63]
 lr5:4 = lshift r15:14 by 7;                                      yr18 = r19 xor r23; xr22 = r19 xor r21;;
  r7 = lshift r17 by -7;        lr13:12 = r13:12 xor r9:8;        xr20 = yr18;;                  // [0 through 30]
 lr11:10 = lshift r1:0 by 1;                                      yr24 = r18 xor r25; xr24 = r22 xor r23;;
lr9:8 = lshift r13:12 by 33;     lr15:14 = r15:14 xor r5:4;       yr20 = xr24;;            // [68 69 70 72 75 76 77 79]
                                                                  yr22 = r24 xor r21;  r0 = lshift r1 by 0;  r1 = r2;;
  r7 = lshift r17 by -28;         r17 = r17 xor r7;;              // [51 54 55 56 58 61 62 63]
                                                                  r22  = r22 xor r20;  r2 = lshift r3 by 0; r3 = r27;;
 lr5:4 = lshift r15:14 by 14;    r15 = r15 xor r11;               [k7+=K4] = xr22;; // [65 68 69 70 72 75 76 77 79]
  r7 = lshift r17 by -14;         r17 = r17 xor r7;               [k6+=K4] = yr22;; // [51 through 63 and 32 through 35]
if nlc0e, jump _loop_64_bits;   r11 = r13 xor r9;                 k4 = k31+1;;  // [0 through 63]

/////////////////////////////////////////////////////////////////
// Circulation state computation
//
// We first need to compute the final state of the original encoder
// (i.e., the one defined in the spec).
// This is defined as 4*s1 + 2*s2 + s3.
//
//  - For the A computation, [s1 s2 s3 -] are in 4 consecutive bits in xr19.
//    We just need to shift r19 by the value computed above in r28 and
//    mask the result (AND with 7) to keep the lowest 3 bits.
//
//  - For the B computation, s1, s2, and s3 have been computed separately in
//    yr24, yr18, and yr19.  We shift these registers appropriately, mask
//    out the lowest bits, and add.
//
// Also, take the N mod 7 result in xr29 and compute
//            8 * (( N mod 7)-1)
// as the offset to the circulation state table.
/////////////////////////////////////////////////////////////////
yr15:14 = lshift r19:18 by r28; yr26 = 4;             xr15 = lshift r19 by r28;;
yr21    = lshift r24    by r28;                       xr14 = 7;;
yr14 = r14 and r26;;
yr15 = r15 and r26;                                   xr15 = r15 and r14;;
yr21 = r21 and r26;     yr14 = lshift r14 by -1;      yr22 = xr15;;
yr15 = lshift r15 by -2;                              xr20 = dec r29;;
yr14 = r14 + r21;;
                                                      xr20 = lshift r20 by 3;;
yr15 = r15 + r14;                                     yr24 = xr20;;
// stall
// Combine A and B final states
yr15 = r15 xor r22;;
// If circulation state is 0, we are already done.
// If not 0, need to encode circulation state output for Y and W.
if yale, jump _end_y_w (NP);;

/////////////////////////////////////////////////////////////////
// Add in circulation state component
//
// Do table lookup to get index into the buffer that has the precomputed
// trellis output with zero input starting from circulation state.
// The precomputed buffer is accessed as a circular buffer of length
// 28.  (The pattern repeats every 7 bits, but we also want an integer
// number of quadwords to facilitate the use of the DAB.  There is no
// need for separate Y and W buffers since the zero-input results
// are shifts of each other.)
//
// In the second column, compute the number of loop iterations.
// We do 128 bits per loop iteration
// Number of Y bits is N, equal to
//  N = num_input_bytes * 8/2 = num_input_bytes*4
//  Therefore, loop iterations = (num_input_bytes*4 + 127)/128
/////////////////////////////////////////////////////////////////
yr16 = r15 + r24;         j12 = _circ_state_table;;
j13 = yr16;;
// 2 stalls
jl0 = 28;                                 xr20 = lshift r30 by 2;;
jb0 = _circ_state_output;;
j14 = [j12 + j13];                        xr21 = 127;;
k6 = k8+k31;;
k7 = k9+k31;;
// 2 stalls
j0 = j14 + _circ_state_output;;
j1 = j14 + _circ_state_output+6;          xr20 = r20 + r21;;
jl1 = 28;;
jb1 = _circ_state_output;                 xr20 = lshift r20 by -7;;
lc0 = xr20;;
r3:0 = q[k6 += k31];;
// Initial dummy DAB read
r7:4 = dab q[j0 += 4];;
r7:4 = dab q[j0 += 4];;
lc1 = xr20;;

.align_code 4;
_circ_y_loop_128_bits:
xlr9:8 = r1:0 xor r5:4;     ylr9:8 = r3:2 xor r7:6;   r7:4 = dab q[j0 += 4]; r3:0 = q[k6 + 4];;
if nlc0e, jump _circ_y_loop_128_bits;   q[k6 += 4] = yxr9:8;;

// Do the same for W
r3:0 = q[k7 += k31];;
r7:4 = dab q[j1 += 4];;
r7:4 = dab q[j1 += 4];;
// 1 stall

.align_code 4;
_circ_w_loop_128_bits:
xlr9:8 = r1:0 xor r5:4;     ylr9:8 = r3:2 xor r7:6;   r7:4 = dab q[j1 += 4]; r3:0 = q[k7 + 4];;
if nlc1e, jump _circ_w_loop_128_bits;   q[k7 += 4] = yxr9:8;;

.align_code 4;
_end_y_w:
// If this is the first time here, set up to run again, this time on the interleaved data.
// If this is the second time, the computations below are harmless and we go on to the epilogue.
k4 = k31 + k31;     j8 = j10 + j31;   r19:18 = r19:18 - r19:18;     sr17:16 = lshift r17:16 by 16;;
r3:0 = r19:16;      j9 = j11 + j31;;
r7:4 = r19:16;      sr11 = lshift r11 by 16;        xr3 = [j8 += 1];;
r15:12 = r19:16;                                    yr3 = [j9 += 1];;
k7:6 = k11:10;      k5 = k5 + 1;;
if keq, jump _loop_64_bits;   lc0 = xr31;           k9:8 = k11:10;;

//-----------------EPILOGUE:--------------------------------//
    yr27:24 = q[k27 + 16];      xr27:24 = q[j27 + 24];;
    yr31:28 = q[k27 + 12];      xr31:28 = q[j27 + 20];;
    cjmp (ABS); j27:24=q[j26+68]; k27:24=q[k26+68]; nop;;
//---------------------------------------------------------//
_ctc_trellis_encoder.end:
上一页 12
💿 文件大小 6 K
👤 上传用户 dedien
📂 所属分类 DSP编程
🏷️ 相关标签

#PHY #CTC #移动通讯 #数据
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -