📄 idct_kc.cpp
字号:
#include "idb_kernelc.hpp"
#include "mpeg.hpp"
#include "idb_kernelc2.hpp"
// Kernel definition entry for `idct`, naming the file KERNELS_DIR "idct_kc.uc".
// NOTE(review): KERNELDEF is declared in a project header not visible here; it
// presumably associates the kernel name with its compiled microcode file --
// confirm against the KernelC headers.
KERNELDEF(idct, KERNELS_DIR "idct_kc.uc");
// idct.i -- Macroblock encode
// Ujval Kapasi
// 3/11/00
//
// 8x8 IDCT (for JPEG and MPEG)
//
// From Pennebaker/Mitchell, pg. 50-52. See also Arai, Agui, Nakajima.
// This algorithm is based on the scaled 1D IDCT.
// clp(val, low, high): clamp `val` into the range [low, high].
//
// Expands to two nested select() calls driven by the comparisons
// (val > high) and (val < low); select(cond, a, b) is assumed to yield `a`
// when `cond` is set and `b` otherwise, as its use here implies.
//
// Every argument use is parenthesized so that an operator expression passed
// as an argument (e.g. `a | b`) keeps its own grouping instead of being
// re-associated with the `>` / `<` inside the macro.
//
// Each argument is expanded more than once, so pass only expressions that
// are free of side effects.
#define clp(val, low, high) \
    select(itocc((val) > (high)), (high), select(itocc((val) < (low)), (low), (val)))
// idct kernel.
//
// Before the main loop, reads eight factors from `consts`, multiplies each by
// the quantizer scale, and keeps the results in quant[0..7]. Then, for every
// group of eight half2 values read from `datain`:
//   1. multiplies the inputs by quant[] and clamps the products to [L, H];
//   2. runs a scaled 1-D IDCT butterfly over the eight values;
//   3. exchanges values between clusters to transpose the block;
//   4. runs the same 1-D butterfly again on the transposed data;
//   5. clamps the results and writes eight half2 values to `out`.
//
// Parameters:
//   datain             - input stream; eight half2 values consumed per iteration.
//   consts             - stream supplying the eight quantization factors (read once).
//   out                - output stream; eight half2 values produced per iteration.
//   uc_quantizer_scale - quantizer scale applied to every factor from `consts`.
//
// NOTE(review): the `& 7` index masks and the eight-nibble permutation words
// suggest eight clusters, each holding one line of the 8x8 block -- confirm.
kernel idct(istream<half2> datain,
istream<uhalf2> consts,
ostream<half2> out,
uc<half2>& uc_quantizer_scale)
{
// Butterfly multipliers; the same constant is replicated in both 16-bit halves.
// Stored in 3.13 format
half2 B1 = 0x2d412d41; // 1.41421 || 1.41421
half2 B2 = 0x539f539f; // 2.61313 || 2.61313
half2 B4 = 0x22a322a3; // 1.08239 || 1.08239
half2 B5 = 0x187e187e; // 0.76537 || 0.76537
// Per-input pre-scale factors K[0..7], applied at the start of each 1-D pass.
// Stored in (1.15)*2^-1 format
expand<half2> K(8);
K[0] = 0x5a825a82; // 0.5 * (1/sqrt(2)) || 0.5 * (1/sqrt(2))
K[1] = 0x7d8a7d8a; // 0.5 * (cos(1*pi/16)) || 0.5 * (cos(1*pi/16))
K[2] = 0x76427642; // 0.5 * (cos(2*pi/16)) || 0.5 * (cos(2*pi/16))
K[3] = 0x6a6e6a6e; // 0.5 * (cos(3*pi/16)) || 0.5 * (cos(3*pi/16))
K[4] = 0x5a825a82; // 0.5 * (cos(4*pi/16)) || 0.5 * (cos(4*pi/16))
K[5] = 0x471d471d; // 0.5 * (cos(5*pi/16)) || 0.5 * (cos(5*pi/16))
K[6] = 0x30fc30fc; // 0.5 * (cos(6*pi/16)) || 0.5 * (cos(6*pi/16))
K[7] = 0x18f918f9; // 0.5 * (cos(7*pi/16)) || 0.5 * (cos(7*pi/16))
// NOTE(review): commclperm(0x8, 0, ...) presumably distributes the
// microcontroller value to every cluster -- confirm against the KernelC reference.
half2 quant_scale = commclperm(0x8, 0, uc_quantizer_scale);
expand<half2> quant(8);
// Unrolled eight times: read one factor from `consts` and combine it with the
// quantizer scale to form the quantization factor for that position.
// NOTE(review): lo() presumably keeps the low 16 bits of each product -- confirm.
// NOTE(review): `consts` is declared istream<uhalf2> but is read into a half2;
// confirm that the signedness mismatch is intended.
half2 utmp;
consts >> utmp;
quant[0] = lo(quant_scale*utmp);
consts >> utmp;
quant[1] = lo(quant_scale*utmp);
consts >> utmp;
quant[2] = lo(quant_scale*utmp);
consts >> utmp;
quant[3] = lo(quant_scale*utmp);
consts >> utmp;
quant[4] = lo(quant_scale*utmp);
consts >> utmp;
quant[5] = lo(quant_scale*utmp);
consts >> utmp;
quant[6] = lo(quant_scale*utmp);
consts >> utmp;
quant[7] = lo(quant_scale*utmp);
// The permutation can be done with 8 communications. (Actually the
// first communication is really just an inter-cluster move because
// the diagonal doesn't change in a transpose.) Each communication
// uses a different permutation and different send and store indices
// for each cluster.
// Comm permutations used to transpose the block.
// Each word is a rotation of the nibble sequence 0..7 by one more position
// than the previous one.
uc<int> perm_a = 0x07654321;
uc<int> perm_b = 0x10765432;
uc<int> perm_c = 0x21076543;
uc<int> perm_d = 0x32107654;
uc<int> perm_e = 0x43210765;
uc<int> perm_f = 0x54321076;
uc<int> perm_g = 0x65432107;
// calculate cluster dependent send and store indices:
// idxk = (cid() - k) mod 8, built by repeated decrement and mask.
int idx0 = cid();
int idx1 = (idx0 - 1) & 7;
int idx2 = (idx1 - 1) & 7;
int idx3 = (idx2 - 1) & 7;
int idx4 = (idx3 - 1) & 7;
int idx5 = (idx4 - 1) & 7;
int idx6 = (idx5 - 1) & 7;
int idx7 = (idx6 - 1) & 7;
// Working storage: in = raw inputs, q = clamped dequantized values,
// y = pre-scaled butterfly inputs, s = butterfly outputs.
expand<half2> in(8), y(8), q(8), s(8);
// Butterfly temporaries shared by both 1-D passes.
half2 a4, a5, a6, a7, b2, b3, b5, b7, c, d, e2, e4, e5, e6, f0, f1, f2;
half2 g, h, i, j0, j1, j2, j3, j4;
// One iteration per group of eight values taken from `datain`.
// NOTE(review): pipeline(1) is presumably a software-pipelining directive -- confirm.
loop_stream(datain) pipeline(1) {
datain >> in[0];
datain >> in[1];
datain >> in[2];
datain >> in[3];
datain >> in[4];
datain >> in[5];
datain >> in[6];
datain >> in[7];
// quant is actually 8 times larger
// Clamp bounds for the dequantized values. Viewed as two signed 16-bit halves
// these are -16384 (0xc000) and 16376 (0x3ff8) per half -- assumes half2 packs
// two signed 16-bit values, TODO confirm.
half2 L = 0xc000c000;
half2 H = 0x3ff83ff8;
// Dequantize: multiply each input by its quantization factor, then clamp.
q[0] = clp(lo(in[0] * quant[0]), L, H);
q[1] = clp(lo(in[1] * quant[1]), L, H);
q[2] = clp(lo(in[2] * quant[2]), L, H);
q[3] = clp(lo(in[3] * quant[3]), L, H);
q[4] = clp(lo(in[4] * quant[4]), L, H);
q[5] = clp(lo(in[5] * quant[5]), L, H);
q[6] = clp(lo(in[6] * quant[6]), L, H);
q[7] = clp(lo(in[7] * quant[7]), L, H);
// ---- First 1-D pass ----
// Pre-scale every value by K[k].
// NOTE(review): shifta(x, -3) presumably is an arithmetic right shift by 3 that
// removes the extra factor of 8 carried by quant -- confirm.
y[0] = hi(mulrnd(K[0], shifta(q[0], -3)));
y[1] = hi(mulrnd(K[1], shifta(q[1], -3)));
y[2] = hi(mulrnd(K[2], shifta(q[2], -3)));
y[3] = hi(mulrnd(K[3], shifta(q[3], -3)));
y[4] = hi(mulrnd(K[4], shifta(q[4], -3)));
y[5] = hi(mulrnd(K[5], shifta(q[5], -3)));
y[6] = hi(mulrnd(K[6], shifta(q[6], -3)));
y[7] = hi(mulrnd(K[7], shifta(q[7], -3)));
// Sums and differences of the odd-indexed inputs (1, 3, 5, 7).
a4 = y[5] - y[3];
a5 = y[1] + y[7];
a6 = y[1] - y[7];
a7 = y[5] + y[3];
// Sum and difference of inputs 2 and 6.
b2 = y[2] - y[6];
b3 = y[2] + y[6];
b5 = a5 - a7;
b7 = a5 + a7;
c = a4 - a6;
// Multiplications by the 3.13 constants B1, B2, B4, B5.
// NOTE(review): shift(x, 3) presumably shifts left by 3 so that the high half
// of the product lines up with the 3.13 constant format -- confirm.
d = hi(mulrnd(B5, shift(c, 3)));
e2 = hi(mulrnd(B1, shift(b2, 3)));
e4 = d - hi(mulrnd(B2, shift(a4, 3)));
e5 = hi(mulrnd(B1, shift(b5, 3)));
e6 = hi(mulrnd(B4, shift(a6, 3))) - d;
// Sum and difference of inputs 0 and 4.
f0 = y[0] + y[4];
f1 = y[0] - y[4];
f2 = e2 - b3;
g = e6 - b7;
h = e5 - g;
i = e4 + h;
j0 = f0 + b3;
j1 = f1 + f2;
j2 = f1 - f2;
j3 = f0 - b3;
// j4 is the negation of i.
j4 = 0-i;
// Final stage: outputs k and 7-k are the sum and difference of the same pair.
s[0] = j0 + b7;
s[1] = j1 + g;
s[2] = j2 + h;
s[3] = j3 + j4;
s[4] = j3 - j4;
s[5] = j2 - h;
s[6] = j1 - g;
s[7] = j0 - b7;
array<half2> buf1(8); // intermediate dct output.
array<half2> buf2(8); // transposed intermediate dct output
// All results in 16.0
// Copy the first-pass outputs into buf1 so they can be indexed by the
// cluster-dependent idx values below.
buf1[0] = s[0];
buf1[1] = s[1];
buf1[2] = s[2];
buf1[3] = s[3];
buf1[4] = s[4];
buf1[5] = s[5];
buf1[6] = s[6];
buf1[7] = s[7];
// Do comm stuff to transpose the matrix
// The element at index idx0 (= cid()) is copied locally without a communication.
// Each of the seven communications sends buf1[idxk] and stores the received
// value at buf2[idx(8-k)].
buf2[idx0] = buf1[idx0];
buf2[idx7] = commucperm(perm_a, buf1[idx1]);
buf2[idx6] = commucperm(perm_b, buf1[idx2]);
buf2[idx5] = commucperm(perm_c, buf1[idx3]);
buf2[idx4] = commucperm(perm_d, buf1[idx4]);
buf2[idx3] = commucperm(perm_e, buf1[idx5]);
buf2[idx2] = commucperm(perm_f, buf1[idx6]);
buf2[idx1] = commucperm(perm_g, buf1[idx7]);
// ---- Second 1-D pass ----
// Same butterfly network as the first pass, applied to the transposed data.
// The pre-scale by K[k] is applied directly to buf2[k]; there is no shifta here.
y[0] = hi(mulrnd(K[0], buf2[0]));
y[1] = hi(mulrnd(K[1], buf2[1]));
y[2] = hi(mulrnd(K[2], buf2[2]));
y[3] = hi(mulrnd(K[3], buf2[3]));
y[4] = hi(mulrnd(K[4], buf2[4]));
y[5] = hi(mulrnd(K[5], buf2[5]));
y[6] = hi(mulrnd(K[6], buf2[6]));
y[7] = hi(mulrnd(K[7], buf2[7]));
a4 = y[5] - y[3];
a5 = y[1] + y[7];
a6 = y[1] - y[7];
a7 = y[5] + y[3];
b2 = y[2] - y[6];
b3 = y[2] + y[6];
b5 = a5 - a7;
b7 = a5 + a7;
c = a4 - a6;
d = hi(mulrnd(B5, shift(c, 3)));
e2 = hi(mulrnd(B1, shift(b2, 3)));
e4 = d - hi(mulrnd(B2, shift(a4, 3)));
e5 = hi(mulrnd(B1, shift(b5, 3)));
e6 = hi(mulrnd(B4, shift(a6, 3))) - d;
f0 = y[0] + y[4];
f1 = y[0] - y[4];
f2 = e2 - b3;
g = e6 - b7;
h = e5 - g;
i = e4 + h;
j0 = f0 + b3;
j1 = f1 + f2;
j2 = f1 - f2;
j3 = f0 - b3;
j4 = 0-i;
s[0] = j0 + b7;
s[1] = j1 + g;
s[2] = j2 + h;
s[3] = j3 + j4;
s[4] = j3 - j4;
s[5] = j2 - h;
s[6] = j1 - g;
s[7] = j0 - b7;
// Clamp the final results. Viewed as two signed 16-bit halves the bounds are
// -256 (0xff00) and 255 (0x00ff) per half -- assumes half2 packs two signed
// 16-bit values, TODO confirm.
s[0] = clp(s[0], 0xff00ff00, 0x00ff00ff);
s[1] = clp(s[1], 0xff00ff00, 0x00ff00ff);
s[2] = clp(s[2], 0xff00ff00, 0x00ff00ff);
s[3] = clp(s[3], 0xff00ff00, 0x00ff00ff);
s[4] = clp(s[4], 0xff00ff00, 0x00ff00ff);
s[5] = clp(s[5], 0xff00ff00, 0x00ff00ff);
s[6] = clp(s[6], 0xff00ff00, 0x00ff00ff);
s[7] = clp(s[7], 0xff00ff00, 0x00ff00ff);
// Emit the eight clamped results for this iteration, in index order.
out << s[0];
out << s[1];
out << s[2];
out << s[3];
out << s[4];
out << s[5];
out << s[6];
out << s[7];
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -