rle_kc.cpp

来自「H.264完整的C语言代码和DCT的代码」· C++ 代码 · 共 622 行 · 第 1/2 页
CPP
622 行
#include "idb_kernelc.hpp"   
#include "mpeg.hpp"
#include "idb_kernelc2.hpp"   

// Invokes the KERNELDEF macro for the kernel named `rle` (defined below),
// passing a path built from KERNELS_DIR followed by "rle_kc.uc".
// NOTE(review): KERNELDEF and KERNELS_DIR are not defined in this file
// (presumably they come from the idb_kernelc headers), and "rle_kc.uc" is
// presumably the compiled microcode for this kernel -- confirm.
KERNELDEF(rle, KERNELS_DIR "rle_kc.uc");

// rle.i    -- Macroblock encode
// Ujval Kapasi
// 3/28/97
// 7/22/97
// 2/23/99
// 05/31/99
// 12/10/99
//
// RLE - run level encodes a macroblock for intra-coded frames.
//   Encodes ac coefficients with run level encoding, and encodes the dc
//   coefficients using predictive coding.

kernel rle(istream<half2> Yin,
           istream<half2> CrCbin,
           istream<int> indices,
           cistream<half2> motion,
           costream<half2> out,
           uc<uint>& pframe,
           uc<uint>& quant_scale)
{
  uint clzero = uint(cid() == 0);
  uint pfm = commclperm(0x8, 0, pframe);
  cc Iframe = itocc(clzero & ~(pfm));
  cc Pframe = itocc(clzero & pfm);
  cc clust_zero = itocc(clzero);     // cluster 0 ?
  cc ccend;  // for cistream

  uc<int> perm_1 = 0x75643120;
  uc<int> perm_2 = 0x67451320;
  uc<int> perm_3 = 0x75261430;
  uc<int> perm_4 = 0x57362401;
  uc<int> perm_5 = 0x74352601;
  uc<int> perm_6 = 0x64270531;
  uc<int> perm_7 = 0x63170542;

  expand<int> store_idx(8), send_idx(8);
  indices >> send_idx[0];
  indices >> store_idx[0];
  indices >> send_idx[1];
  indices >> store_idx[1];
  indices >> send_idx[2];
  indices >> store_idx[2];
  indices >> send_idx[3];
  indices >> store_idx[3];
  indices >> send_idx[4];
  indices >> store_idx[4];
  indices >> send_idx[5];
  indices >> store_idx[5];
  indices >> send_idx[6];
  indices >> store_idx[6];
  indices >> send_idx[7];
  indices >> store_idx[7];

  // Comm permutations used to transpose the block
  uc<int> perm_a = 0x07654321;
  uc<int> perm_b = 0x10765432;
  uc<int> perm_c = 0x21076543;
  uc<int> perm_d = 0x32107654;
  uc<int> perm_e = 0x43210765;
  uc<int> perm_f = 0x54321076;
  uc<int> perm_g = 0x65432107;

  // RLE

  loop_stream(Yin) {

    // This code will run-level encode the data in the Y4Y3, Y2Y1 and CrCb
    // arrays into the rle_out array. The data is stored in half2 format
    // in the source array, with a 16-bit element from two different blocks
    // stored in each location (word) in the array. The destination array
    // will contain only data for one block in each location. The format
    // of the result data will be such that the top 16 bits contain the run
    // and the bottom 16 bits contain the level. So the layout of the source
    // array is Y2Y1 in the first 8 words, then Y4Y3 in the 2nd eight, etc.
    // Each of the eight clusters has 8 such elements, for a total of 64.
    // The destination array format is Y1 in the first 8 words, Y2 in the
    // second eight, Y3 in the third eight, etc.

    send_idx[0]  %= send_idx[0];
    store_idx[0] %= store_idx[0];
    send_idx[1]  %= send_idx[1];
    store_idx[1] %= store_idx[1];
    send_idx[2]  %= send_idx[2];
    store_idx[2] %= store_idx[2];
    send_idx[3]  %= send_idx[3];
    store_idx[3] %= store_idx[3];
    send_idx[4]  %= send_idx[4];
    store_idx[4] %= store_idx[4];
    send_idx[5]  %= send_idx[5];
    store_idx[5] %= store_idx[5];
    send_idx[6]  %= send_idx[6];
    store_idx[6] %= store_idx[6];
    send_idx[7]  %= send_idx[7];
    store_idx[7] %= store_idx[7];

    array<half2> Y4Y3(8), Y2Y1(8), CrCb(8);
    half2 tmp;
    Yin >> tmp; Y2Y1[0] = tmp; 
    Yin >> tmp; Y2Y1[1] = tmp;
    Yin >> tmp; Y2Y1[2] = tmp; 
    Yin >> tmp; Y2Y1[3] = tmp;
    Yin >> tmp; Y2Y1[4] = tmp; 
    Yin >> tmp; Y2Y1[5] = tmp;
    Yin >> tmp; Y2Y1[6] = tmp; 
    Yin >> tmp; Y2Y1[7] = tmp;
    Yin >> tmp; Y4Y3[0] = tmp; 
    Yin >> tmp; Y4Y3[1] = tmp;
    Yin >> tmp; Y4Y3[2] = tmp; 
    Yin >> tmp; Y4Y3[3] = tmp;
    Yin >> tmp; Y4Y3[4] = tmp; 
    Yin >> tmp; Y4Y3[5] = tmp;
    Yin >> tmp; Y4Y3[6] = tmp; 
    Yin >> tmp; Y4Y3[7] = tmp;
    CrCbin >> tmp; CrCb[0] = tmp; CrCbin >> tmp; CrCb[1] = tmp;
    CrCbin >> tmp; CrCb[2] = tmp; CrCbin >> tmp; CrCb[3] = tmp;
    CrCbin >> tmp; CrCb[4] = tmp; CrCbin >> tmp; CrCb[5] = tmp;
    CrCbin >> tmp; CrCb[6] = tmp; CrCbin >> tmp; CrCb[7] = tmp;

    // The zig zag ordering code below will reorder the Y2Y1 data into words
    // 0-7 in the destination array in each cluster. Y4Y3 will go into words
    // 16-23 in the destination array, and CrCb into words 32-39. The 
    // reordering is achieved by each cluster independently sending data
    // from different locations and storing them in different locations
    // also. Using this method, eight communications are necessary to reorder
    // the data. However, the first communication is unnecessary, because it
    // only needs to reorder data within the cluster (ie, it is moving data
    // that needs to be stored in the same cluster again.) Note the zig-zag
    // code reorders the data in such a way as to transpose the data also.
    // Thus, after the zig-zag reordering, cluster 0 will have elements 0-7
    // of each block, cluster 1 will have elements 8-15, etc. This is how
    // the data is initially :
    //
    //                       Cluster
    //               0   1   2   3   4   5   6   7
    //
    //
    //         0     0   2   3   9  10  20  21  35
    //
    //         1     1   4   8  11  19  22  34  36
    //
    //         2     5   7  12  18  23  33  37  48
    //
    //  Index  3     6  13  17  24  32  38  47  49
    //
    //         4    14  16  25  31  39  46  50  57
    //
    //         5    15  26  30  40  45  51  56  58
    //
    //         6    27  29  41  44  52  55  59  62
    //
    //         7    28  42  43  53  54  60  61  63
    //
    // Thus, element 0 needs to move from location 0 in cluster 0 to location
    // 0 in cluster 0 (again). Element 1 also goes to the same logical
    // location. Element 2 needs to move from location 0 of cluster 1 to
    // location 2 of cluster 0. Element 3 needs to go from location 0 of
    // cluster 2 to location 3 of cluster 0, and so on. Here are all the
    // communications necessary :
    //
    // Communication 1    Communication 2    Communication 3    Communication 4
    //  0[0] -> 0[0]       0[1] -> 0[1]       0[2] -> 0[5]       0[3] -> 0[6]
    //  1[3] -> 1[5]       1[4] -> 2[0]       1[5] -> 3[2]       1[6] -> 3[5]
    //  2[3] -> 2[1]       2[1] -> 1[0]       2[2] -> 1[4]       2[6] -> 5[1]
    //  3[3] -> 3[0]       3[4] -> 3[7]       3[2] -> 2[2]       3[0] -> 1[1]
    //  4[3] -> 4[0]       4[4] -> 4[7]       4[5] -> 5[5]       4[1] -> 2[3]
    //  5[4] -> 5[6]       5[5] -> 6[3]       5[3] -> 4[6]       5[6] -> 6[7]
    //  6[4] -> 6[2]       6[3] -> 5[7]       6[5] -> 7[0]       6[1] -> 4[2]
    //  7[4] -> 7[1]       7[5] -> 7[2]       7[2] -> 6[0]       7[6] -> 7[6]
    //
    // Communication 5    Communication 6    Communication 7    Communication 8
    //  0[4] -> 1[6]       0[5] -> 1[7]       0[6] -> 3[3]       0[7] -> 3[4]
    //  1[0] -> 0[2]       1[1] -> 0[4]       1[2] -> 0[7]       1[7] -> 5[2]
    //  2[4] -> 3[1]       2[5] -> 3[6]       2[7] -> 5[3]       2[0] -> 0[3]
    //  3[5] -> 5[0]       3[6] -> 5[4]       3[1] -> 1[3]       3[7] -> 6[5]
    //  4[2] -> 2[7]       4[6] -> 6[4]       4[7] -> 6[6]       4[0] -> 1[2]
    //  5[7] -> 7[4]       5[2] -> 4[1]       5[0] -> 2[4]       5[1] -> 2[6]
    //  6[2] -> 4[5]       6[0] -> 2[5]       6[6] -> 7[3]       6[7] -> 7[5]
    //  7[3] -> 6[1]       7[7] -> 7[7]       7[0] -> 4[3]       7[1] -> 4[4]
    //
    // The resulting indices that are required for each permutation can be
    // found in the constants file. The required permutations are :

    array<int> zigzag(24);   // the output array

    zigzag[0+store_idx[0]] = int(Y2Y1[send_idx[0]]);
    zigzag[8+store_idx[0]] = int(Y4Y3[send_idx[0]]);
    zigzag[16+store_idx[0]] = int(CrCb[send_idx[0]]);

    zigzag[0+store_idx[1]] =  int(commucperm(perm_1, Y2Y1[send_idx[1]]));
    zigzag[8+store_idx[1]] = int(commucperm(perm_1, Y4Y3[send_idx[1]]));
    zigzag[16+store_idx[1]] = int(commucperm(perm_1, CrCb[send_idx[1]]));

    zigzag[0+store_idx[2]] =  int(commucperm(perm_2, Y2Y1[send_idx[2]]));
    zigzag[8+store_idx[2]] = int(commucperm(perm_2, Y4Y3[send_idx[2]]));
    zigzag[16+store_idx[2]] = int(commucperm(perm_2, CrCb[send_idx[2]]));

    zigzag[0+store_idx[3]] =  int(commucperm(perm_3, Y2Y1[send_idx[3]]));
    zigzag[8+store_idx[3]] = int(commucperm(perm_3, Y4Y3[send_idx[3]]));
    zigzag[16+store_idx[3]] = int(commucperm(perm_3, CrCb[send_idx[3]]));

    zigzag[0+store_idx[4]] =  int(commucperm(perm_4, Y2Y1[send_idx[4]]));
    zigzag[8+store_idx[4]] = int(commucperm(perm_4, Y4Y3[send_idx[4]]));
    zigzag[16+store_idx[4]] = int(commucperm(perm_4, CrCb[send_idx[4]]));

    zigzag[0+store_idx[5]] =  int(commucperm(perm_5, Y2Y1[send_idx[5]]));
    zigzag[8+store_idx[5]] = int(commucperm(perm_5, Y4Y3[send_idx[5]]));
    zigzag[16+store_idx[5]] = int(commucperm(perm_5, CrCb[send_idx[5]]));

    zigzag[0+store_idx[6]] =  int(commucperm(perm_6, Y2Y1[send_idx[6]]));
    zigzag[8+store_idx[6]] = int(commucperm(perm_6, Y4Y3[send_idx[6]]));
    zigzag[16+store_idx[6]] = int(commucperm(perm_6, CrCb[send_idx[6]]));

    zigzag[0+store_idx[7]] =  int(commucperm(perm_7, Y2Y1[send_idx[7]]));
    zigzag[8+store_idx[7]] = int(commucperm(perm_7, Y4Y3[send_idx[7]]));
    zigzag[16+store_idx[7]] = int(commucperm(perm_7, CrCb[send_idx[7]]));


    // First, each cluster independently does the run-level encoding for
    // the eight values of each block that that cluster has. Later, the
    // "mini-blocks" of eight will be patched up. To do the run level encoding,
    // a variable called run is used to keep track of the current run. The
    // final value of this variable needs to be saved and will be used to
    // "patch up" the encoding for the whole block. Also, only the run-level
    // pairs are recorded. All zero elements are removed. Thus, not all eight
    // locations within a block hold useful data at the end of this operation.
    // A zero is used to mark the first non-useful data (if there are less
    // than eight actual run-level pairs). Storing only the actual run-level
    // pairs speeds up the condition output stream loop later because it
    // doesn't necessarily have to check all eight values, and thus the loop
    // can break early if possible. Finally, the data is initially
    // stored in half2 format with each 16 bit half containing an element
    // from a different block. Thus, each loop iteration, each cluster run
    // level encodes eight values from two different blocks at the same time.
    // The first block's results overwrite the original 8 elements, and the
    // second block's results will occupy the eight locations that start
    // right after the source elements. Thus, for example, Y4Y3 originally
    // is in locations 16-23, and the results for Y3 will be in 16-23, and
    // those for Y4 will be in 24-31. (Actually, there may be fewer than eight
    // results for each block, but the layout remains the same...)

    uc<int> j, k;    // loop vars
    cc ac_zero;        // original half2 elements == 0 ??
    cc all_non_zero;   // Were all 8 values in this cluster != 0 for this block
    int src_idx = 0;   // source data index
    int data_idx = 0;  // base destination data index
    int lo_idx;        // result data index for first block
    int hi_idx;        // result data index for second block
    int hi_mask, lo_mask;  // 16 bit masks
    int run_level;     // high 16 bits = run, low 16 bits = level
    half2 run;         // current run of zeros
    half2 ac_val;      // original levels (zero and non zero)
    int lo_ac_val;     // ac value for first block
    int hi_ac_val;     // ac value for second block
    int run_idx = 0;   // 0-5 for Y1,Y2,Y3,Y4,Cb,CR respectively
    int tmp_inc;
    int hi_inc;        // increment for hi_idx

    array<int> rle_out(96); // array holding the final run_levels
    array<int> runs(12);    // remainder run; used for "patching up"

    // initialize rle_out with -1s
    uc<int> i = 12;
    loop_count(i) {
      rle_out[0+data_idx] = -1;
      rle_out[1+data_idx] = -1;
      rle_out[2+data_idx] = -1;
      rle_out[3+data_idx] = -1;
      data_idx = data_idx + 4;
    }

    half2 h2_one = 1 | half2(shift(1, 16));
    lo_mask = shift(-1, -16);
    hi_mask = shift(-1, 16);
    data_idx = 0;
    i = 3;
    loop_count(i) {
      lo_idx = data_idx;
      hi_idx = data_idx + 8;

      // get values and check if they == 0
      ac_val = half2(zigzag[src_idx]);
      ac_zero = itocc(int(ac_val == 0));

      // first run is 0, only set level - will be fixed by "patch up" later
      lo_ac_val = int(ac_val) & lo_mask;
      run_level = lo_ac_val;
      rle_out[lo_idx] = run_level;

      // do the same for second block
      hi_ac_val = shift(int(ac_val), -16);
      run_level = hi_ac_val;
      rle_out[hi_idx] = run_level;

      // update source data index
      src_idx = src_idx + 1;

      // result pointers update unconditionally if it was a dc value; else
      // if it was an ac value, they only increment if the level != 0
      lo_idx = lo_idx + select(Iframe, 1, select(ac_zero, 0, 1));
      tmp_inc = int(select(ac_zero, half2(0), h2_one));
      hi_inc = shift(tmp_inc, -16);
      hi_idx = hi_idx + select(Iframe, 1, hi_inc);

      // update the current run of zeros, and reset to 0 if level was non_zero
      run = half2(select(Iframe, 0, int(select(ac_zero, h2_one, 0))));
rle_kc.cpp - 源码说明

本页面展示了「H.264完整的C语言代码和DCT的代码」中的 rle_kc.cpp 源码文件，采用 C++ 编程语言编写，共 622 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与264相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?