idct_kc.cpp

来自「H.264完整的C语言代码和DCT的代码」· C++ 代码 · 共 260 行
CPP
260 行
#include "idb_kernelc.hpp"   
#include "mpeg.hpp"
#include "idb_kernelc2.hpp"   

// Associates the kernel name `idct` with the path KERNELS_DIR "idct_kc.uc".
// NOTE(review): KERNELDEF and KERNELS_DIR are defined outside this file;
// presumably the .uc file holds the compiled microcode for the kernel
// defined below -- confirm against the kernel headers.
KERNELDEF(idct, KERNELS_DIR "idct_kc.uc");

// idct.i    -- Macroblock encode
// Ujval Kapasi
// 3/11/00
//
// 8x8 IDCT (for JPEG and MPEG)
// 
// From Pennebaker/Mitchell, pg. 50-52.  See also Arai, Agui, Nakajima.
// This algorithm is based on the scaled 1D IDCT.

// clp(val, low, high): clip `val` into [low, high].
//
// Yields `high` when val > high, `low` when val < low, and `val` otherwise
// (assuming select(c, a, b) yields `a` where condition code `c` is set --
// select() and itocc() are provided by the kernel headers).
//
// Every argument is parenthesized so that a compound expression passed as
// an argument (e.g. `a | b`) keeps its own grouping instead of being
// re-associated with the `>` / `<` comparisons inside the macro.
// `val` is expanded three times and `low` / `high` twice each, so only pass
// side-effect-free expressions.
#define clp(val, low, high) \
          select(itocc((val) > (high)), (high), \
                 select(itocc((val) < (low)), (low), (val)))

// idct -- 8x8 inverse DCT kernel.
//
// Before the main loop the kernel reads eight words from `consts` and
// multiplies each by the quantizer scale to build quant[0..7].  Then, for
// every group of eight words read from `datain`, it:
//   1. multiplies the inputs by quant[] and clips them,
//   2. applies the K[] scale factors and runs a 1D scaled IDCT butterfly,
//   3. transposes the intermediate block across clusters,
//   4. applies K[] again and runs the same butterfly a second time,
//   5. clips the results and writes eight words to `out`.
//
// Parameters:
//   datain             - input stream; eight words consumed per iteration.
//   consts             - stream of eight quantization constants, read once.
//   out                - output stream; eight words produced per iteration.
//   uc_quantizer_scale - quantizer scale, fetched once before the loop.
//
// NOTE(review): every constant below repeats the same 16-bit pattern in
// both halves of the word, so half2 presumably packs two 16-bit lanes that
// are processed in parallel -- confirm against the KernelC type definitions.
kernel idct(istream<half2> datain,
            istream<uhalf2> consts,
            ostream<half2> out,
            uc<half2>& uc_quantizer_scale)
{
  // Butterfly multipliers for the scaled 1D IDCT.
  // Stored in 3.13 format
  half2 B1 = 0x2d412d41;        // 1.41421  ||  1.41421
  half2 B2 = 0x539f539f;        // 2.61313  ||  2.61313
  half2 B4 = 0x22a322a3;        // 1.08239  ||  1.08239
  half2 B5 = 0x187e187e;        // 0.76537  ||  0.76537

  // Per-coefficient scale factors applied before each butterfly pass.
  // K[0] and K[4] hold the same bit pattern.
  // Stored in (1.15)*2^-1 format
  expand<half2> K(8);
  K[0] = 0x5a825a82;        // 0.5 * (1/sqrt(2))     ||  0.5 * (1/sqrt(2))
  K[1] = 0x7d8a7d8a;        // 0.5 * (cos(1*pi/16))  ||  0.5 * (cos(1*pi/16))
  K[2] = 0x76427642;        // 0.5 * (cos(2*pi/16))  ||  0.5 * (cos(2*pi/16))
  K[3] = 0x6a6e6a6e;        // 0.5 * (cos(3*pi/16))  ||  0.5 * (cos(3*pi/16))
  K[4] = 0x5a825a82;        // 0.5 * (cos(4*pi/16))  ||  0.5 * (cos(4*pi/16))
  K[5] = 0x471d471d;        // 0.5 * (cos(5*pi/16))  ||  0.5 * (cos(5*pi/16))
  K[6] = 0x30fc30fc;        // 0.5 * (cos(6*pi/16))  ||  0.5 * (cos(6*pi/16))
  K[7] = 0x18f918f9;        // 0.5 * (cos(7*pi/16))  ||  0.5 * (cos(7*pi/16))


  // NOTE(review): commclperm() is defined outside this file; presumably this
  // fetches the quantizer scale so every cluster holds a copy -- confirm.
  half2 quant_scale = commclperm(0x8, 0, uc_quantizer_scale);
  expand<half2> quant(8);

  // this loop calculates the combined quantization factors
  // (adjust with quantizer scale)
  // (Written out as eight explicit read/multiply pairs rather than a loop.)
  // NOTE(review): `consts` is declared istream<uhalf2> but is read into a
  // half2 temporary -- confirm the signed/unsigned mix is intended.
  half2 utmp;
  consts >> utmp;
  quant[0] = lo(quant_scale*utmp);
  consts >> utmp;
  quant[1] = lo(quant_scale*utmp);
  consts >> utmp;
  quant[2] = lo(quant_scale*utmp);
  consts >> utmp;
  quant[3] = lo(quant_scale*utmp);
  consts >> utmp;
  quant[4] = lo(quant_scale*utmp);
  consts >> utmp;
  quant[5] = lo(quant_scale*utmp);
  consts >> utmp;
  quant[6] = lo(quant_scale*utmp);
  consts >> utmp;
  quant[7] = lo(quant_scale*utmp);

  // The permutation can be done with 8 communications. (Actually the
  // first communication is really just an inter-cluster move because
  // the diagonal doesn't change in a transpose.) Each communication
  // uses a different permutation and different send and store indices
  // for each cluster.
  // In the code below the diagonal element (index idx0) is a plain
  // buf1 -> buf2 copy; the other seven elements go through commucperm().

  // Comm permutations used to transpose the block
  // Each constant is the previous one rotated by one hex digit.
  uc<int> perm_a = 0x07654321;
  uc<int> perm_b = 0x10765432;
  uc<int> perm_c = 0x21076543;
  uc<int> perm_d = 0x32107654;
  uc<int> perm_e = 0x43210765;
  uc<int> perm_f = 0x54321076;
  uc<int> perm_g = 0x65432107;  

  // calculate cluster dependent send and store indices
  // idx0 is this cluster's id; each following index is the previous one
  // minus one, wrapped into 0..7 by the `& 7` mask.
  int idx0 = cid();
  int idx1 = (idx0 - 1) & 7;
  int idx2 = (idx1 - 1) & 7;
  int idx3 = (idx2 - 1) & 7;
  int idx4 = (idx3 - 1) & 7;
  int idx5 = (idx4 - 1) & 7;
  int idx6 = (idx5 - 1) & 7;
  int idx7 = (idx6 - 1) & 7;


  // in: raw input words, q: rescaled and clipped inputs,
  // y: K-scaled butterfly inputs, s: butterfly outputs.
  expand<half2> in(8), y(8), q(8), s(8);
  // Butterfly temporaries, reused by both 1D passes.
  half2 a4, a5, a6, a7, b2, b3, b5, b7, c, d, e2, e4, e5, e6, f0, f1, f2;
  half2 g, h, i, j0, j1, j2, j3, j4;

  // One iteration consumes eight input words and produces eight output words.
  loop_stream(datain) pipeline(1) {
    datain >> in[0];
    datain >> in[1];
    datain >> in[2];
    datain >> in[3];
    datain >> in[4];
    datain >> in[5];
    datain >> in[6];
    datain >> in[7];

    // Multiply each input by its quantization factor, then clip to [L, H].
    // NOTE(review): read as signed 16-bit lanes, L is -16384 and H is 16376
    // -- confirm the lane interpretation.
    // quant is actually 8 times larger
    half2 L = 0xc000c000;
    half2 H = 0x3ff83ff8;
    q[0] = clp(lo(in[0] * quant[0]), L, H);
    q[1] = clp(lo(in[1] * quant[1]), L, H);
    q[2] = clp(lo(in[2] * quant[2]), L, H);
    q[3] = clp(lo(in[3] * quant[3]), L, H);
    q[4] = clp(lo(in[4] * quant[4]), L, H);
    q[5] = clp(lo(in[5] * quant[5]), L, H);
    q[6] = clp(lo(in[6] * quant[6]), L, H);
    q[7] = clp(lo(in[7] * quant[7]), L, H);

    // Apply the K[] scale factors for the first pass.
    // NOTE(review): shifta(x, -3) is presumably an arithmetic right shift by
    // three that removes the 8x factor mentioned above -- confirm.
    y[0] = hi(mulrnd(K[0], shifta(q[0], -3)));
    y[1] = hi(mulrnd(K[1], shifta(q[1], -3)));
    y[2] = hi(mulrnd(K[2], shifta(q[2], -3)));
    y[3] = hi(mulrnd(K[3], shifta(q[3], -3)));
    y[4] = hi(mulrnd(K[4], shifta(q[4], -3)));
    y[5] = hi(mulrnd(K[5], shifta(q[5], -3)));
    y[6] = hi(mulrnd(K[6], shifta(q[6], -3)));
    y[7] = hi(mulrnd(K[7], shifta(q[7], -3)));

    // ---- First 1D pass: butterfly on y[0..7], results in s[0..7] ----

    // Sums and differences of the odd-indexed terms.
    a4 = y[5] - y[3];
    a5 = y[1] + y[7];
    a6 = y[1] - y[7];
    a7 = y[5] + y[3];

    b2 = y[2] - y[6];
    b3 = y[2] + y[6];
    b5 = a5 - a7;
    b7 = a5 + a7;

    // Multiplications by the B constants; operands are shifted by 3 first.
    c = a4 - a6;
    d = hi(mulrnd(B5, shift(c, 3)));

    e2 = hi(mulrnd(B1, shift(b2, 3)));
    e4 = d - hi(mulrnd(B2, shift(a4, 3)));
    e5 = hi(mulrnd(B1, shift(b5, 3)));
    e6 = hi(mulrnd(B4, shift(a6, 3))) - d;

    // Sums and differences of the even-indexed terms.
    f0 = y[0] + y[4];
    f1 = y[0] - y[4];
    f2 = e2 - b3;

    g = e6 - b7;
    h = e5 - g;
    i = e4 + h;

    j0 = f0 + b3;
    j1 = f1 + f2;
    j2 = f1 - f2;
    j3 = f0 - b3;
    j4 = 0-i;

    // Final stage: outputs k and 7-k are the sum and difference of one pair.
    s[0] = j0 + b7;
    s[1] = j1 + g;
    s[2] = j2 + h;
    s[3] = j3 + j4;
    s[4] = j3 - j4;
    s[5] = j2 - h;
    s[6] = j1 - g;
    s[7] = j0 - b7;

    array<half2> buf1(8);  // intermediate IDCT output (result of the first pass).
    array<half2> buf2(8);  // transposed intermediate IDCT output

    // All results in 16.0
    buf1[0] = s[0];
    buf1[1] = s[1];
    buf1[2] = s[2];
    buf1[3] = s[3];
    buf1[4] = s[4];
    buf1[5] = s[5];
    buf1[6] = s[6];
    buf1[7] = s[7];
  
    // Do comm stuff to transpose the matrix
    // The diagonal element is copied locally; each remaining element is
    // exchanged with its own permutation and cluster-dependent indices.
  
    buf2[idx0] = buf1[idx0];
    buf2[idx7] = commucperm(perm_a, buf1[idx1]);
    buf2[idx6] = commucperm(perm_b, buf1[idx2]);
    buf2[idx5] = commucperm(perm_c, buf1[idx3]);
    buf2[idx4] = commucperm(perm_d, buf1[idx4]);
    buf2[idx3] = commucperm(perm_e, buf1[idx5]);
    buf2[idx2] = commucperm(perm_f, buf1[idx6]);
    buf2[idx1] = commucperm(perm_g, buf1[idx7]);

    // Apply the K[] scale factors for the second pass (no shifta here,
    // unlike the first pass).
    y[0] = hi(mulrnd(K[0], buf2[0]));
    y[1] = hi(mulrnd(K[1], buf2[1]));
    y[2] = hi(mulrnd(K[2], buf2[2]));
    y[3] = hi(mulrnd(K[3], buf2[3]));
    y[4] = hi(mulrnd(K[4], buf2[4]));
    y[5] = hi(mulrnd(K[5], buf2[5]));
    y[6] = hi(mulrnd(K[6], buf2[6]));
    y[7] = hi(mulrnd(K[7], buf2[7]));

    // ---- Second 1D pass: same butterfly as above, on the transposed data ----

    a4 = y[5] - y[3];
    a5 = y[1] + y[7];
    a6 = y[1] - y[7];
    a7 = y[5] + y[3];

    b2 = y[2] - y[6];
    b3 = y[2] + y[6];
    b5 = a5 - a7;
    b7 = a5 + a7;

    c = a4 - a6;
    d = hi(mulrnd(B5, shift(c, 3)));

    e2 = hi(mulrnd(B1, shift(b2, 3)));
    e4 = d - hi(mulrnd(B2, shift(a4, 3)));
    e5 = hi(mulrnd(B1, shift(b5, 3)));
    e6 = hi(mulrnd(B4, shift(a6, 3))) - d;

    f0 = y[0] + y[4];
    f1 = y[0] - y[4];
    f2 = e2 - b3;

    g = e6 - b7;
    h = e5 - g;
    i = e4 + h;

    j0 = f0 + b3;
    j1 = f1 + f2;
    j2 = f1 - f2;
    j3 = f0 - b3;
    j4 = 0-i;

    s[0] = j0 + b7;
    s[1] = j1 + g;
    s[2] = j2 + h;
    s[3] = j3 + j4;
    s[4] = j3 - j4;
    s[5] = j2 - h;
    s[6] = j1 - g;
    s[7] = j0 - b7;

    // Clip the final results before they are written out.
    // NOTE(review): read as signed 16-bit lanes, the bounds are -256 and 255
    // -- confirm the lane interpretation.
    s[0] = clp(s[0], 0xff00ff00, 0x00ff00ff);
    s[1] = clp(s[1], 0xff00ff00, 0x00ff00ff);
    s[2] = clp(s[2], 0xff00ff00, 0x00ff00ff);
    s[3] = clp(s[3], 0xff00ff00, 0x00ff00ff);
    s[4] = clp(s[4], 0xff00ff00, 0x00ff00ff);
    s[5] = clp(s[5], 0xff00ff00, 0x00ff00ff);
    s[6] = clp(s[6], 0xff00ff00, 0x00ff00ff);
    s[7] = clp(s[7], 0xff00ff00, 0x00ff00ff);

    // Emit the eight result words for this iteration.
    out << s[0];
    out << s[1];
    out << s[2];
    out << s[3];
    out << s[4];
    out << s[5];
    out << s[6];
    out << s[7];
  }
}
idct_kc.cpp - 源码说明

本页面展示了「H.264完整的C语言代码和DCT的代码」中的 idct_kc.cpp 源码文件，采用 C++ 编程语言编写，共 260 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与264相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?