⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mpeg_kc.cpp

📁 H.264完整的C语言代码和DCT的代码
💻 CPP
📖 第 1 页 / 共 3 页
字号:
#include "idb_kernelc.hpp"   
#include "mpeg.hpp"
#include "idb_kernelc2.hpp"   

// NOTE(review): presumably associates the mb_encode kernel defined below with
// its microcode file "mpeg_kc.uc" -- confirm against the KERNELDEF macro
// definition in the kernel headers.
KERNELDEF(mb_encode, "mpeg_kc.uc");

// mb_encode.i    -- Macroblock encode
// Ujval Kapasi
// 3/28/97
// 7/22/97
// 2/23/99
//
// This is actually a concatenation of three kernels...
//
// 1.
// Color space warp (RGB space ---> YCrCb space) for JPEG
//   Includes a 2:1 subsampling in the horizontal and vertical direction
//   for Cr and Cb. These are obtained by interpolating between two pixels.
//   The sampling is done as follows (4:2:0 MPEG-2 format) :
//      o   o   o   o   o     (the "o" represents a luminance sample,
//      .       .       .      the "." represents a chrominance sample)
//      o   o   o   o   o
//      .       .       .
//      o   o   o   o   o
//
//
// Block layout :    Y1  Y2       -- each block is 8x8
//                   Y3  Y4
//
// The input comes in such that the first row of block Y1 comes in, one
//   per cluster. Then the first row of block Y2, then the second row of
//   block Y1, second row of block Y2, and so on.  Then after the eighth
//   row of Y2, the same pattern repeats for the rows of blocks Y3 and Y4.
//
// Input  : color = 0 | R || G | B     // in 8.0 format
//
// Output :   Y   =   Y2  ||  Y1       // in 16.0 format
//            C   =   Cr  ||  Cb       // in 16.0 format
//
//
// NOTE : IGNORING GAMMA CORRECTION for now
//
// NOTE : eventually should be done in count up mode for generality
//
// ISTREAM 1 : constants stored in SRF and brought into clusters
//
//
//
// 2.
// 8x8 DCT (for JPEG and MPEG)
// 
// From Pennebaker/Mitchell, pg. 50-52.  See also Arai, Agui, Nakajima.
// This algorithm is based on the 16-pt DFT.  Basically, the 8-pt DCT can
//   be calculated by scaling the real parts of the output of the 16-pt DFT.
//
// ISTREAM 0 : constants stored in SRF and brought into clusters
//
//
//
//
// 3.
// RLE - run level encodes a macroblock for intra-coded frames.
//   Encodes ac coefficients with run level encoding, and encodes the dc
//   coefficients using predictive coding.
//
// ISTREAM 0 : constants stored in SRF and brought into clusters
//

kernel mb_encode(istream<byte4> datain,
                 istream<half2> consts,
                 ostream<half2> color_out,
                 ostream<half2> dct_out,
                 costream<int> out)
{

  // Color constants

  // Format is  : 0 - 7 : 1 row/column from Y2 and Y1 blocks.  Y2
  //                        entries in high part and Y1 in low part
  //                        of every word.
  //            : 8 - 15, 16 - 23  : Y4 and Y3,  and  Cr and Cb respectively
  array<half2> CrCbY4Y3Y2Y1_a(24);
  array<half2> CrCbY4Y3Y2Y1_b(24);
  array<int> dc_pred(3);                // for rle.i

  int minus_one = 0 - 1;
  int two = 1 + 1;
  int four = two + two;
  int sixteen = lo(four * four);
  int minus_sixteen = 0 - sixteen;


  // cluster :  7  |  6  |  5  |  4  |  3  |  2  |  1  |  0  |
  // perm_a  : C7  | C6  | C5  | C4  | C6  | C4  | C2  | C0  |
  // perm_b  : C6  | C4  | C2  | C0  | C3  | C2  | C1  | C0  |
  uc<int> perm_A = 0x76546420;
  uc<int> perm_B = 0x64203210;

  cc low = itocc(cid() < four);
  cc Y_combine = itocc(shift(minus_one, sixteen));
  // Y_combine = itocc(half2(0) == half2(1));   //  TRUE  ||  FALSE


  // Shuffle control words
  byte4 shuf_func1, shuf_func2, shuf_func3;
  //shuf_func1  = 0x08020800;           // zero |  3rd || zero |  1st
  //shuf_func2  = 0x08080801;           // zero | zero || zero |  2nd
  //shuf_func3  = 0x01000100;           //         1st ||  1st
  half2 temp;
  consts >> temp;
  shuf_func1 = byte4(temp);
  consts >> temp;
  shuf_func2 = byte4(temp);
  consts >> temp;
  shuf_func3 = byte4(temp);

  // Luminance transform constants
  half2 RB_SCALE, G_SCALE, C_SCALE;
  // in 1.15 format
  // RB_SCALE = 0x26460e98          // 0.299  || 0.114
  // G_SCALE  = 0x00004b23          //   0    || 0.587
  // C_SCALE  = 0x50004000          //  0.625 ||  0.5
  consts >> RB_SCALE >> G_SCALE >> C_SCALE;

  half2 one_two_eight;     // For adding by 128
  // one_two_eight = 0x01000100;
  consts >> one_two_eight;



  // DCT constants

  int three = two + 1;
  int seven = four + three;

  half2 COS_2, COS_3, COS_1_plus_COS_3, COS_1_minus_COS_3;
  // Stored in 2.14 format
  // COS_2             = 0x2d412d41;    // cos(2*pi/8) || cos(2*pi/8);
  // COS_3             = 0x187e187e;    // cos(3*pi/8) || cos(3*pi/8);
  // COS_1_plus_COS_3  = 0x539f539f;    // cos(pi/8) + cos(3*pi/8) || same
  // COS_1_minus_COS_3 = 0x22a322a3;    // cos(pi/8) - cos(3*pi/8) || same
  consts >> COS_2 >> COS_3 >> COS_1_plus_COS_3 >> COS_1_minus_COS_3;

  array<half2> K(8);
  // Stored in 2.14 format
  // K0 = 0x16a116a1           // 0.25 * sqrt(2)       || 0.25 * sqrt(2);
  // K1 = 0x10501050           // 0.25 * sec(pi/16)    || 0.25 * sec(pi/16);
  // K2 = 0x11511151           // 0.25 * sec(2*pi/16)  || 0.25 * sec(2*pi/16);
  // K3 = 0x133e133e           // 0.25 * sec(3*pi/16)  || 0.25 * sec(3*pi/16);
  // K4 = 0x16a116a1           // 0.25 * sec(4*pi/16)  || 0.25 * sec(4*pi/16);
  // K5 = 0x1ccd1ccd           // 0.25 * sec(5*pi/16)  || 0.25 * sec(5*pi/16);
  // K6 = 0x29cf29cf           // 0.25 * sec(6*pi/16)  || 0.25 * sec(6*pi/16);
  // K7 = 0x52035203           // 0.25 * sec(7*pi/16)  || 0.25 * sec(7*pi/16);
  consts >> K[0] >> K[1] >> K[2] >> K[3] >> K[4] >> K[5] >> K[6] >> K[7];

  // The quantization factors are assumed to be stored in the SRF as
  // 8/QuantFactor. This can be done without a loss of generality. These
  // fractions can then be read and directly multiplied with the appropriate
  // DCT coefficients, avoiding costly divides. More optimizations are also
  // made. The final step of the DCT multiplies each DCT coefficient with
  // a constant. This constant and the quantization coefficient are combined
  // to reduce the number of multiplies which are necessary. Also, the
  // division by the quantizer scale is combined with the quantization
  // coefficients. (The reciprocal of the quantizer scale is what is
  // actually stored in the SRF, similar to the quant factors.) Also, the
  // same quantization factors are used for all the luminance blocks.
  // However, because of the layout of the source array (CrCbY4Y3Y2Y1),
  // it makes sense to just store the quantization coefficients twice.
  // Thus, locations 0-7 in the quantization factor array are the
  // luminance quantization factors, and locations 8-15 are a copy of them.
  // Locations 16-23 are the CrCb quantization factors.
  //
  //  quant_scale in unsigned 1.15  (max value is 1.0)
  //  istream is in unsigned 1.15 (max value is 1.0)
  //  (quant_scale * istream) is in signed 2.14 (max value is 1.0)
  //  K is in 2.14  (max value is approx 1.30...)
  //  K * (quant_scale * istream) is in 4.12 -- max value is approx 1.30...
  //                                         -- need at least signed 2.14
  //  quant is in signed 2.14
  uhalf2 quant_scale;
  array<half2> quant(24);

  consts >> temp;
  quant_scale = uhalf2(temp);

  int quant_idx = 0;
  int k_idx = 0;
  uhalf2 utmp;

  // this loop calculates the combined quantization factors for luminance
  // blocks.
  uc<int> i = 8;
  loop_count(i) unroll(1) {
    consts >> temp;
    utmp = uhalf2(temp);    // get original quant factor

    // adjust with quantizer scale
    utmp = shift(hi(quant_scale * utmp), 1);    // utmp in unsigned 1.15

    // calculate new combined dct/quantization coefficient
    quant[quant_idx] = half2(hi(utmp * uhalf2(shift(K[quant_idx], 1))));

    quant_idx = quant_idx + 1;
  }

  // this loop just copies the above coefficients for simplicity of
  // implementation later
  i = 8;
  int quant_idx2 = 0;
  loop_count(i) unroll(1) {
    quant[quant_idx] = quant[quant_idx2];
    quant_idx = quant_idx + 1;
    quant_idx2 = quant_idx2 + 1;
  }

  // this loop calculates the combined quantization factors for chrominance
  // blocks.
  i = 8;
  k_idx = quant_idx & seven;  
  loop_count(i) unroll(1) {
    consts >> temp;
    utmp = uhalf2(temp);
    utmp = shift(hi(quant_scale * utmp), 1);
    quant[quant_idx] = half2(hi(utmp * uhalf2(shift(K[k_idx], 1))));

    quant_idx = quant_idx + 1;
    k_idx = quant_idx & seven;  
  }

  // The permutation can be done with 8 communications. (Actually the
  // first communication is really just an inter-cluster move because
  // the diagonal doesn't change in a transpose.) Each communication
  // uses a different permutation and different send and store indices
  // for each cluster.

  // Comm permutations used to transpose the block

  uc<int> perm_a = 0x07654321;
  uc<int> perm_b = 0x10765432;
  uc<int> perm_c = 0x21076543;
  uc<int> perm_d = 0x32107654;
  uc<int> perm_e = 0x43210765;
  uc<int> perm_f = 0x54321076;
  uc<int> perm_g = 0x65432107;


  // RLE CONSTANTS

  // reset value for the dc predictions
  int dp;
  consts >> temp;
  dp = int(temp);
  dc_pred[0] = dp;    // Y blocks
  dc_pred[1] = dp;    // Cb blocks
  dc_pred[2] = dp;    // Cr blocks

  uc<int> perm_1 = 0x75643120;
  uc<int> perm_2 = 0x67451320;
  uc<int> perm_3 = 0x75261430;
  uc<int> perm_4 = 0x57362401;
  uc<int> perm_5 = 0x74352601;
  uc<int> perm_6 = 0x64270531;
  uc<int> perm_7 = 0x63170542;

  array<int> send_idx_arr(8), store_idx_arr(8);
  consts >> temp; send_idx_arr[0] = int(temp);
  consts >> temp; store_idx_arr[0] = int(temp);
  consts >> temp; send_idx_arr[1] = int(temp);
  consts >> temp; store_idx_arr[1] = int(temp);
  consts >> temp; send_idx_arr[2] = int(temp);
  consts >> temp; store_idx_arr[2] = int(temp);
  consts >> temp; send_idx_arr[3] = int(temp);
  consts >> temp; store_idx_arr[3] = int(temp);
  consts >> temp; send_idx_arr[4] = int(temp);
  consts >> temp; store_idx_arr[4] = int(temp);
  consts >> temp; send_idx_arr[5] = int(temp);
  consts >> temp; store_idx_arr[5] = int(temp);
  consts >> temp; send_idx_arr[6] = int(temp);
  consts >> temp; store_idx_arr[6] = int(temp);
  consts >> temp; send_idx_arr[7] = int(temp);
  consts >> temp; store_idx_arr[7] = int(temp);

  // Color

  // loop counts
  int y_idx = 0;
  int c_idx;
  i = 8;

  byte4 color1, color2, color3, color4;
  half2 a0, a1, a2, a3, a4, a5, a6, a7, a1a3, a2a4, b1, b2, b3, b4;
  half2 c1, c2, c3, c4, d0, d1, d2, d3, d4, d5, d6, d7, e1, e2, e3, e4;
  half2  y1, y2, y3, y4, z1, z2, z3, z4, first, second;

  // each iter. does one pixel in each of two blocks for two rows (4 pixels)
  loop_count(i) pipeline(1) {

  stage(1);

// NOTE : IGNORING GAMMA CORRECTION for now

    // The input data are in 16.0.
    datain >> color1 >> color2 >> color3 >> color4;

    // a =    R   ||   B
    // b =    0   ||   G
    a1 = half2(shuffle(color1, shuf_func1));
    b1 = half2(shuffle(color1, shuf_func2));
    a2 = half2(shuffle(color2, shuf_func1));
    b2 = half2(shuffle(color2, shuf_func2));
    a3 = half2(shuffle(color3, shuf_func1));
    b3 = half2(shuffle(color3, shuf_func2));
    a4 = half2(shuffle(color4, shuf_func1));
    b4 = half2(shuffle(color4, shuf_func2));

    // After shifting, the inputs are in 15.1 format. The constants are in
    // 1.15 format, so the result will be in 16.0 format. The result is
    // c =  0.299R || 0.114B
    c1 = hi(RB_SCALE * shift(a1, 1));
    c2 = hi(RB_SCALE * shift(a2, 1));
    c3 = hi(RB_SCALE * shift(a3, 1));
    c4 = hi(RB_SCALE * shift(a4, 1));

    // d =     0   || 0.114B + 0.587G
    d1 = c1 + hi(G_SCALE * shift(b1, 1));
    d2 = c2 + hi(G_SCALE * shift(b2, 1));
    d3 = c3 + hi(G_SCALE * shift(b3, 1));
    d4 = c4 + hi(G_SCALE * shift(b4, 1));

    // e =     0   || 0.299R
    e1 = half2(shift(int(c1), minus_sixteen));
    e2 = half2(shift(int(c2), minus_sixteen));
    e3 = half2(shift(int(c3), minus_sixteen));
    e4 = half2(shift(int(c4), minus_sixteen));

    a1a3 = a1 + a3;
    a2a4 = a2 + a4;

    // y =    -   ||   Y (0.114B + 0.587G + 0.299R)
    y1 = d1 + e1;
    y2 = d2 + e2;
    y3 = d3 + e3;
    y4 = d4 + e4;

  stage(2);

    // z =    Y   ||   Y
    z1 = half2(shuffle(y1, shuf_func3));
    z2 = half2(shuffle(y2, shuf_func3));
    z3 = half2(shuffle(y3, shuf_func3));
    z4 = half2(shuffle(y4, shuf_func3));

    CrCbY4Y3Y2Y1_a[0+y_idx] = select(Y_combine, z2, z1);
    CrCbY4Y3Y2Y1_a[1+y_idx] = select(Y_combine, z4, z3);

    // a and z are in 16.0. first and second are averages of two pixels,
    // where each pixel adds the following weight to the average :
    // (a-z)/1.6 + 128 || (a-z)/2 + 128.  A little math is done to factor
    // the multiplication (division) out of the average to reduce the number
    // of necessary multiplications and to obtain the maximum precision
    // without shifts. C_SCALE is in 1.15, so the division by two is implicit,
    // and no shift is necessary. The addition by 128 is also factored out.
    // Each pixel has an additive factor of 128, for a total of 256, divided
    // by two is 128.
    first = hi((a1a3 - (z1 + z3)) * C_SCALE) + one_two_eight;
    second = hi((a2a4 - (z2 + z4)) * C_SCALE) + one_two_eight;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -