📄 mb_encode_kc.cpp

📁 H.264完整的C语言代码和DCT的代码
💻 CPP
📖 第 1 页 / 共 3 页
字号:
    // The first four clusters get first, and the second half get second.
    first = commucperm(perm_A, first);
    second = commucperm(perm_B, second);

    c_idx = shift(y_idx, minus_one);
    y_idx = y_idx + two;
    CrCbY4Y3Y2Y1_a[16+c_idx] = select(low, first, second);
  }

  // Append all 24 entries of CrCbY4Y3Y2Y1_a to the color_out stream, in
  // ascending index order. The writes are unrolled by hand and grouped
  // eight at a time (entries 0-7, 8-15, 16-23).
  color_out << CrCbY4Y3Y2Y1_a[0];
  color_out << CrCbY4Y3Y2Y1_a[1];
  color_out << CrCbY4Y3Y2Y1_a[2];
  color_out << CrCbY4Y3Y2Y1_a[3];
  color_out << CrCbY4Y3Y2Y1_a[4];
  color_out << CrCbY4Y3Y2Y1_a[5];
  color_out << CrCbY4Y3Y2Y1_a[6];
  color_out << CrCbY4Y3Y2Y1_a[7];

  color_out << CrCbY4Y3Y2Y1_a[8];
  color_out << CrCbY4Y3Y2Y1_a[9];
  color_out << CrCbY4Y3Y2Y1_a[10];
  color_out << CrCbY4Y3Y2Y1_a[11];
  color_out << CrCbY4Y3Y2Y1_a[12];
  color_out << CrCbY4Y3Y2Y1_a[13];
  color_out << CrCbY4Y3Y2Y1_a[14];
  color_out << CrCbY4Y3Y2Y1_a[15];

  color_out << CrCbY4Y3Y2Y1_a[16];
  color_out << CrCbY4Y3Y2Y1_a[17];
  color_out << CrCbY4Y3Y2Y1_a[18];
  color_out << CrCbY4Y3Y2Y1_a[19];
  color_out << CrCbY4Y3Y2Y1_a[20];
  color_out << CrCbY4Y3Y2Y1_a[21];
  color_out << CrCbY4Y3Y2Y1_a[22];
  color_out << CrCbY4Y3Y2Y1_a[23];


  // DCT
  //
  // Set-up for the DCT + quantization loop that follows: small integer
  // constants, the rounding constants used after quantization, and the
  // per-cluster indices used for the inter-cluster transpose.
  
  // Constants are derived from `four` (declared outside this section)
  // rather than written as literals.
  int eight = four + four;
  sixteen = eight + eight;
  // NOTE(review): assuming shift() with a positive count shifts left, h2_one
  // holds the value 1 in both 16-bit halves of the packed word -- confirm.
  half2 h2_one = 1 | half2(shift(1, sixteen));
  // h2_one shifted by 15: under the same assumption this is 0x8000 in each
  // half, i.e. "one half" when compared against a 16-bit fractional part.
  uhalf2 uh2_half = shift(uhalf2(h2_one), sixteen - 1);
  // One less than uh2_half in each half; used as the rounding threshold
  // for values that are not <= 0 (see the rounding code in the loop).
  uhalf2 uh2_almost_half = uh2_half - uhalf2(h2_one);

  // calculate cluster dependent send and store indices

  // Running offsets into CrCbY4Y3Y2Y1_b (dest) and CrCbY4Y3Y2Y1_a (src);
  // each is advanced by eight per loop iteration.
  int dest_idx = 0;
  int src_idx = 0;
  // idx0 is this cluster's id as returned by cid(); idx1..idx7 each step
  // one below the previous index and wrap through the `& seven` mask
  // (presumably seven == 7, giving a rotation over 0..7 -- defined
  // outside this section).
  int idx0 = cid();
  int idx1 = (idx0 - 1) & seven;
  int idx2 = (idx1 - 1) & seven;
  int idx3 = (idx2 - 1) & seven;
  int idx4 = (idx3 - 1) & seven;
  int idx5 = (idx4 - 1) & seven;
  int idx6 = (idx5 - 1) & seven;
  int idx7 = (idx6 - 1) & seven;

  // DCT + quantization loop.
  //
  // Each iteration reads eight entries of CrCbY4Y3Y2Y1_a starting at
  // src_idx, runs a 1-D 8-point DCT on them, exchanges the results between
  // clusters, runs the 1-D DCT a second time on the exchanged data, then
  // quantizes and rounds the eight outputs into CrCbY4Y3Y2Y1_b starting at
  // dest_idx. With the loop count set to 3 this presumably covers all
  // 24 entries of each array.
  //
  // NOTE(review): loop_count()/pipeline()/stage() look like KernelC
  // scheduling directives (software pipelining); their exact semantics are
  // not visible in this section -- do not reorder statements across the
  // stage() markers without checking.
  i = 3;
  loop_count(i) pipeline(67) {

  stage(1);

    // get a's from scratchpad -- in 16.0 format
    a0 = CrCbY4Y3Y2Y1_a[0+src_idx];
    a1 = CrCbY4Y3Y2Y1_a[1+src_idx];
    a2 = CrCbY4Y3Y2Y1_a[2+src_idx];
    a3 = CrCbY4Y3Y2Y1_a[3+src_idx];
    a4 = CrCbY4Y3Y2Y1_a[4+src_idx];
    a5 = CrCbY4Y3Y2Y1_a[5+src_idx];
    a6 = CrCbY4Y3Y2Y1_a[6+src_idx];
    a7 = CrCbY4Y3Y2Y1_a[7+src_idx];
    // advance to the next group of eight source entries
    src_idx = src_idx + eight;
  
    // do the 1d dct
    // Sums of the mirrored input pairs (0,7), (1,6), (2,5), (3,4) and the
    // sums of those sums.
    half2 s16, s07, s25, s34, s1625, s0734;
  
    s07 = a0 + a7;
    s16 = a1 + a6;
    s25 = a2 + a5;
    s34 = a3 + a4;
    s1625 = s16 + s25;
    s0734 = s07 + s34;
  
    // Differences of the same mirrored pairs; d1625/d0734 are differences
    // of the pair sums computed above.
    half2 d16, d07, d25, d34, d1625, d0734;
  
    d07 = a0 - a7;
    d16 = a1 - a6;
    d25 = a2 - a5;
    d34 = a3 - a4;
    d1625 = s16 - s25;
    d0734 = s07 - s34;

    half2 sd16d07, sd25d34;
  
    sd16d07 = d07 + d16;
    sd25d34 = d25 + d34;
  
    half2 m1_over_2, m2, m5, m6, m7, m8, m9;
  
    // All results in 16.0
    // m5..m9 are the five products with the cosine constants. Each operand
    // is passed through shift(.., two) before the multiply and hi() is taken
    // of the product (presumably a fixed-point pre-scale followed by keeping
    // the upper half of the product -- confirm against the COS_* format).
    m1_over_2 = s0734 + s1625;
    m2 = s0734 - s1625;
    m5 = hi(COS_2 * shift(d1625 + d0734, two));
    m6 = hi(COS_2 * shift(d25 + d16, two));
    m7 = hi(COS_3 * shift(sd16d07 - sd25d34, two));
    m8 = hi((COS_1_plus_COS_3) * shift(sd16d07, two));
    m9 = hi((COS_1_minus_COS_3) * shift(sd25d34, two));
  
    half2 s5, s6, s7, s8;
  
    s5 = d07 + m6;
    s6 = d07 - m6;
    s7 = m8 - m7;
    s8 = m9 - m7;
  
    array<half2> buf1(8);  // intermediate dct output.  ie, do rows
    array<half2> buf2(8);  //   then store here.  Then index into this
                           //   differently to get the columns

    // All results in 16.0
    // First-pass outputs, each multiplied by its per-coefficient factor K[n].
    buf1[0] = hi(K[0] * shift(m1_over_2, two));
    buf1[1] = hi(K[1] * shift(s5 + s7, two));
    buf1[2] = hi(K[2] * shift(d0734 + m5, two));
    buf1[3] = hi(K[3] * shift(s6 - s8, two));
    buf1[4] = hi(K[4] * shift(m2, two));
    buf1[5] = hi(K[5] * shift(s6 + s8, two));
    buf1[6] = hi(K[6] * shift(d0734 - m5, two));
    buf1[7] = hi(K[7] * shift(s5 - s7, two));
  
    // Do comm stuff to transpose the matrix to do rows now
    // The entry at this cluster's own index (idx0) is copied locally; the
    // other seven entries are obtained through commucperm() with the
    // permutations perm_a..perm_g. The store index runs idx7..idx1 while
    // the send index runs idx1..idx7.
  
    buf2[idx0] = buf1[idx0];
    buf2[idx7] = commucperm(perm_a, buf1[idx1]);
    buf2[idx6] = commucperm(perm_b, buf1[idx2]);
    buf2[idx5] = commucperm(perm_c, buf1[idx3]);
    buf2[idx4] = commucperm(perm_d, buf1[idx4]);
    buf2[idx3] = commucperm(perm_e, buf1[idx5]);
    buf2[idx2] = commucperm(perm_f, buf1[idx6]);
    buf2[idx1] = commucperm(perm_g, buf1[idx7]);
    
    // reload a0..a7 from the transposed buffer buf2 -- In 16.0 format
    a0 = buf2[0];
    a1 = buf2[1];
    a2 = buf2[2];
    a3 = buf2[3];
    a4 = buf2[4];
    a5 = buf2[5];
    a6 = buf2[6];
    a7 = buf2[7];

  stage(2);

    // Second 1-D DCT pass: same butterfly as the first pass, applied to the
    // transposed data now held in a0..a7.
    s07 = a0 + a7;
    s16 = a1 + a6;
    s25 = a2 + a5;
    s34 = a3 + a4;
    s1625 = s16 + s25;
    s0734 = s07 + s34;
  
    d07 = a0 - a7;
    d16 = a1 - a6;
    d25 = a2 - a5;
    d34 = a3 - a4;
    d1625 = s16 - s25;
    d0734 = s07 - s34;
  
    sd16d07 = d07 + d16;
    sd25d34 = d25 + d34;
  
    // All results in 16.0
    m1_over_2 = s0734 + s1625;
    m2 = s0734 - s1625;
    m5 = hi(COS_2 * shift(d1625 + d0734, two));
    m6 = hi(COS_2 * shift(d25 + d16, two));
    m7 = hi(COS_3 * shift(sd16d07 - sd25d34, two));
    m8 = hi((COS_1_plus_COS_3) * shift(sd16d07, two));
    m9 = hi((COS_1_minus_COS_3) * shift(sd25d34, two));
  
    s5 = d07 + m6;
    s6 = d07 - m6;
    s7 = m8 - m7;
    s8 = m9 - m7;
  
    // Second-pass outputs. Unlike the first pass, no K[] factor is applied
    // here; d0..d7 are multiplied by quant[] entries below instead.
    d0 = m1_over_2;
    d1 = s5 + s7;
    d2 = d0734 + m5;
    d3 = s6 - s8;
    d4 = m2;
    d5 = s6 + s8;
    d6 = d0734 - m5;
    d7 = s5 - s7;
  
  
    // Round the quantized result such that 0.5 -> 1.0, and -0.5 -> -1.0.
    //
    // The same five-step pattern is repeated for d0..d7:
    //   1. sign      : condition code for (dN <= 0)
    //   2. round_cmp : uh2_half when sign is set, else uh2_almost_half
    //   3. dct_quant : double-width product quant[N+dest_idx] * shift(dN, two)
    //   4. add       : condition code for round_cmp < lo(dct_quant), i.e. the
    //                  low part of the product exceeds the rounding threshold
    //   5. store hi(dct_quant) + h2_one when add is set, else hi(dct_quant)
    // (select() is read here as returning its second argument when the
    // condition is set and its third otherwise.)
  
    uhalf2 round_cmp;          // value to compare w/ fractional part of result
    double<half2> dct_quant;   // quantized dct coefficient
    cc sign, add;
  
    sign = itocc(int(d0 <= 0));
    round_cmp = select(sign, uh2_half, uh2_almost_half);
    dct_quant = quant[0+dest_idx] * shift(d0, two);
    add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
    CrCbY4Y3Y2Y1_b[0+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
  
    sign = itocc(int(d1 <= 0));
    round_cmp = select(sign, uh2_half, uh2_almost_half);
    dct_quant = quant[1+dest_idx] * shift(d1, two);
    add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
    CrCbY4Y3Y2Y1_b[1+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
  
    sign = itocc(int(d2 <= 0));
    round_cmp = select(sign, uh2_half, uh2_almost_half);
    dct_quant = quant[2+dest_idx] * shift(d2, two);
    add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
    CrCbY4Y3Y2Y1_b[2+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
  
    sign = itocc(int(d3 <= 0));
    round_cmp = select(sign, uh2_half, uh2_almost_half);
    dct_quant = quant[3+dest_idx] * shift(d3, two);
    add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
    CrCbY4Y3Y2Y1_b[3+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
  
    sign = itocc(int(d4 <= 0));
    round_cmp = select(sign, uh2_half, uh2_almost_half);
    dct_quant = quant[4+dest_idx] * shift(d4, two);
    add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
    CrCbY4Y3Y2Y1_b[4+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
  
    sign = itocc(int(d5 <= 0));
    round_cmp = select(sign, uh2_half, uh2_almost_half);
    dct_quant = quant[5+dest_idx] * shift(d5, two);
    add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
    CrCbY4Y3Y2Y1_b[5+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
  
    sign = itocc(int(d6 <= 0));
    round_cmp = select(sign, uh2_half, uh2_almost_half);
    dct_quant = quant[6+dest_idx] * shift(d6, two);
    add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
    CrCbY4Y3Y2Y1_b[6+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
  
    sign = itocc(int(d7 <= 0));
    round_cmp = select(sign, uh2_half, uh2_almost_half);
    dct_quant = quant[7+dest_idx] * shift(d7, two);
    add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
    CrCbY4Y3Y2Y1_b[7+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));

    // advance to the next group of eight destination entries
    dest_idx = dest_idx + eight;
  }

  // At this point each cluster has a row of data.
  //
  // Append all 24 entries of CrCbY4Y3Y2Y1_b (the quantized, rounded DCT
  // results) to the dct_out stream, in ascending index order. The writes
  // are unrolled by hand and grouped eight at a time.

  dct_out << CrCbY4Y3Y2Y1_b[0];
  dct_out << CrCbY4Y3Y2Y1_b[1];
  dct_out << CrCbY4Y3Y2Y1_b[2];
  dct_out << CrCbY4Y3Y2Y1_b[3];
  dct_out << CrCbY4Y3Y2Y1_b[4];
  dct_out << CrCbY4Y3Y2Y1_b[5];
  dct_out << CrCbY4Y3Y2Y1_b[6];
  dct_out << CrCbY4Y3Y2Y1_b[7];

  dct_out << CrCbY4Y3Y2Y1_b[8];
  dct_out << CrCbY4Y3Y2Y1_b[9];
  dct_out << CrCbY4Y3Y2Y1_b[10];
  dct_out << CrCbY4Y3Y2Y1_b[11];
  dct_out << CrCbY4Y3Y2Y1_b[12];
  dct_out << CrCbY4Y3Y2Y1_b[13];
  dct_out << CrCbY4Y3Y2Y1_b[14];
  dct_out << CrCbY4Y3Y2Y1_b[15];

  dct_out << CrCbY4Y3Y2Y1_b[16];
  dct_out << CrCbY4Y3Y2Y1_b[17];
  dct_out << CrCbY4Y3Y2Y1_b[18];
  dct_out << CrCbY4Y3Y2Y1_b[19];
  dct_out << CrCbY4Y3Y2Y1_b[20];
  dct_out << CrCbY4Y3Y2Y1_b[21];
  dct_out << CrCbY4Y3Y2Y1_b[22];
  dct_out << CrCbY4Y3Y2Y1_b[23];


  // RLE

  // This code will run-level encode the data in the CrCbY4Y3Y2Y1
  // array into the rle_out array. The data is stored in half2 format in
  // the source array, with a 16-bit element from two different blocks
  // stored in each location (word) in the array. The destination array
  // will contain only data for one block in each location. The format
  // of the result data will be such that the top 16 bits contain the run
  // and the bottom 16 bits contain the level (except for the dc value,
  // which only contains the dc prediction.) So the layout of the source
  // array is Y2Y1 in the first 8 words, then Y4Y3 in the 2nd eight, etc.
  // Each of the eight clusters has 8 such elements, for a total of 64.
  // The destination array format is Y1 in the first 8 words, Y2 in the
  // second eight, Y3 in the third eight, etc.


  // The zig zag ordering code below will reorder the Y2Y1 data into words
  // 0-7 in the destination array in each cluster. Y4Y3 will go into words
  // 16-23 in the destination array, and CrCb into words 32-39. The 
  // reordering is achieved by each cluster independently sending data
  // from different locations and storing them in different locations
  // also. Using this method, eight communications are necessary to reorder
  // the data. However, the first communication is unnecessary, because it
  // only needs to reorder data within the cluster (ie, it is moving data
  // that needs to be stored in the same cluster again.) Note the zig-zag
  // code reorders the data in such a way as to transpose the data also.
  // Thus, after the zig-zag reordering, cluster 0 will have elements 0-7
  // of each block, cluster 1 will have elements 8-15, etc. This is how
  // the data is initially :
  //
  //                       Cluster
  //               0   1   2   3   4   5   6   7
  //
  //
  //         0     0   2   3   9  10  20  21  35
  //
  //         1     1   4   8  11  19  22  34  36
  //
  //         2     5   7  12  18  23  33  37  48
  //
  //  Index  3     6  13  17  24  32  38  47  49
  //
  //         4    14  16  25  31  39  46  50  57
  //
  //         5    15  26  30  40  45  51  56  58
  //
  //         6    27  29  41  44  52  55  59  62
  //
  //         7    28  42  43  53  54  60  61  63
  //
  // Thus, element 0 needs to move from location 0 in cluster 0 to location
  // 0 in cluster 0 (again). Element 1 also goes to the same logical
  // location. Element 2 needs to move from location 0 of cluster 1 to
  // location 2 of cluster 0. Element 3 needs to go from location 0 of
  // cluster 2 to location 3 of cluster 0, and so on. Here are all the
  // communications necessary :
  //
  // Communication 1    Communication 2    Communication 3    Communication 4
  //  0[0] -> 0[0]       0[1] -> 0[1]       0[2] -> 0[5]       0[3] -> 0[6]
  //  1[3] -> 1[5]       1[4] -> 2[0]       1[5] -> 3[2]       1[6] -> 3[5]
  //  2[3] -> 2[1]       2[1] -> 1[0]       2[2] -> 1[4]       2[6] -> 5[1]
  //  3[3] -> 3[0]       3[4] -> 3[7]       3[2] -> 2[2]       3[0] -> 1[1]
  //  4[3] -> 4[0]       4[4] -> 4[7]       4[5] -> 5[5]       4[1] -> 2[3]
  //  5[4] -> 5[6]       5[5] -> 6[3]       5[3] -> 4[6]       5[6] -> 6[7]
  //  6[4] -> 6[2]       6[3] -> 5[7]       6[5] -> 7[0]       6[1] -> 4[2]
  //  7[4] -> 7[1]       7[5] -> 7[2]       7[2] -> 6[0]       7[6] -> 7[6]
  //
  // Communication 5    Communication 6    Communication 7    Communication 8
  //  0[4] -> 1[6]       0[5] -> 1[7]       0[6] -> 3[3]       0[7] -> 3[4]
  //  1[0] -> 0[2]       1[1] -> 0[4]       1[2] -> 0[7]       1[7] -> 5[2]
  //  2[4] -> 3[1]       2[5] -> 3[6]       2[7] -> 5[3]       2[0] -> 0[3]
  //  3[5] -> 5[0]       3[6] -> 5[4]       3[1] -> 1[3]       3[7] -> 6[5]
  //  4[2] -> 2[7]       4[6] -> 6[4]       4[7] -> 6[6]       4[0] -> 1[2]
  //  5[7] -> 7[4]       5[2] -> 4[1]       5[0] -> 2[4]       5[1] -> 2[6]
  //  6[2] -> 4[5]       6[0] -> 2[5]       6[6] -> 7[3]       6[7] -> 7[5]
  //  7[3] -> 6[1]       7[7] -> 7[7]       7[0] -> 4[3]       7[1] -> 4[4]
  //
  // The resulting indices that are required for each permutation can be
  // found in the constants file. The required permutations are :

  array<int> rle_out(48);   // the output array

  int store_idx, send_idx;

  store_idx = store_idx_arr[0];
  send_idx = send_idx_arr[0];
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -