📄 dct_kc.cpp
字号:
#include "idb_kernelc.hpp"
#include "mpeg.hpp"
#include "idb_kernelc2.hpp"
KERNELDEF(dct, "mpeg_sc/dct_kc.uc");
// dct.i -- Macroblock encode
// Ujval Kapasi
// 3/28/97
// 7/22/97
// 2/23/99
// 05/31/99
// 12/10/99
//
// 8x8 DCT (for JPEG and MPEG)
//
// From Pennebaker/Mitchell, pg. 50-52. See also Arai, Agui, Nakajima.
// This algorithm is based on the 16-pt DFT. Basically, the 8-pt DCT can
// be calculated by scaling the real parts of the output of the 16-pt DFT.
kernel dct(istream<half2> datain,
istream<uhalf2> consts, // in 1.15
ostream<half2> out,
uc<uhalf2>& uc_quantizer_scale) // 1/uc_quantizer_scale in 0.16
{
// Stored in 2.14 format
half2 COS_2 = 0x2d412d41; // cos(2*pi/8) || cos(2*pi/8);
half2 COS_3 = 0x187e187e; // cos(3*pi/8) || cos(3*pi/8);
half2 COS_1_plus_COS_3 = 0x539f539f; // cos(pi/8) + cos(3*pi/8) || same
half2 COS_1_minus_COS_3 = 0x22a322a3; // cos(pi/8) - cos(3*pi/8) || same
expand<half2> K(8);
// Stored in 2.14 format
K[0] = 0x16a116a1; // 0.25 * sqrt(2) || 0.25 * sqrt(2);
K[1] = 0x10501050; // 0.25 * sec(pi/16) || 0.25 * sec(pi/16);
K[2] = 0x11511151; // 0.25 * sec(2*pi/16) || 0.25 * sec(2*pi/16);
K[3] = 0x133e133e; // 0.25 * sec(3*pi/16) || 0.25 * sec(3*pi/16);
K[4] = 0x16a116a1; // 0.25 * sec(4*pi/16) || 0.25 * sec(4*pi/16);
K[5] = 0x1ccd1ccd; // 0.25 * sec(5*pi/16) || 0.25 * sec(5*pi/16);
K[6] = 0x29cf29cf; // 0.25 * sec(6*pi/16) || 0.25 * sec(6*pi/16);
K[7] = 0x52035203; // 0.25 * sec(7*pi/16) || 0.25 * sec(7*pi/16);
// The quantization factors are assumed to be stored in the SRF as
// 8/QuantFactor*Kx. This can be done without a loss of generality. These
// fractions can then be read directly avoiding costly divides.
// More optimizations are also made. The final step of the DCT multiplies
// each DCT coefficient with a constant. This constant and the
// quantization coefficient are combined to reduce the number of
// multiplies which are necessary. Also, the division by the quantizer
// scale is combined with the quantization coefficients. (the reciprocal
// of the quantizer scale is what is actually stored in the SRF, similar
// to the quant factors.)
// quant_scale max value is 1.0
// quant_coefficients max value is 1.0
// K max value is approx 1.30
// K * (quant_scale * istream) -- max value is approx 1.30...
// -- need at least unsigned 1.15, signed 2.14
synch(uc_quantizer_scale);
uhalf2 quant_scale = commclperm(0x8, 0, uc_quantizer_scale);
expand<half2> quant(8);
int quant_idx = 0;
uhalf2 utmp;
// this code calculates the combined quantization factors
// (adjusted with quantizer scale)
consts >> utmp;
quant[0] = half2(shift(hi(mulrnd(quant_scale, utmp)), -1));
consts >> utmp;
quant[1] = half2(shift(hi(mulrnd(quant_scale, utmp)), -1));
consts >> utmp;
quant[2] = half2(shift(hi(mulrnd(quant_scale, utmp)), -1));
consts >> utmp;
quant[3] = half2(shift(hi(mulrnd(quant_scale, utmp)), -1));
consts >> utmp;
quant[4] = half2(shift(hi(mulrnd(quant_scale, utmp)), -1));
consts >> utmp;
quant[5] = half2(shift(hi(mulrnd(quant_scale, utmp)), -1));
consts >> utmp;
quant[6] = half2(shift(hi(mulrnd(quant_scale, utmp)), -1));
consts >> utmp;
quant[7] = half2(shift(hi(mulrnd(quant_scale, utmp)), -1));
// The permutation can be done with 8 communications. (Actually the
// first communication is really just an inter-cluster move because
// the diagonal doesn't change in a transpose.) Each communication
// uses a different permutation and different send and store indices
// for each cluster.
// Comm permutations used to transpose the block
uc<int> perm_a = 0x07654321;
uc<int> perm_b = 0x10765432;
uc<int> perm_c = 0x21076543;
uc<int> perm_d = 0x32107654;
uc<int> perm_e = 0x43210765;
uc<int> perm_f = 0x54321076;
uc<int> perm_g = 0x65432107;
// DCT
half2 h2_one = 1 | half2(shift(1, 16));
uhalf2 uh2_half = shift(uhalf2(h2_one), 16 - 1);
uhalf2 uh2_almost_half = uh2_half - uhalf2(h2_one);
// calculate cluster dependent send and store indices
int idx0 = cid();
int idx1 = (idx0 - 1) & 7;
int idx2 = (idx1 - 1) & 7;
int idx3 = (idx2 - 1) & 7;
int idx4 = (idx3 - 1) & 7;
int idx5 = (idx4 - 1) & 7;
int idx6 = (idx5 - 1) & 7;
int idx7 = (idx6 - 1) & 7;
loop_stream(datain) pipeline(1) {
// get a's -- in 16.0 format
half2 a0, a1, a2, a3, a4, a5, a6, a7;
datain >> a0 >> a1 >> a2 >> a3 >> a4 >> a5 >> a6 >> a7;
// do the 1d dct
half2 s16, s07, s25, s34, s1625, s0734;
s07 = a0 + a7;
s16 = a1 + a6;
s25 = a2 + a5;
s34 = a3 + a4;
s1625 = s16 + s25;
s0734 = s07 + s34;
half2 d16, d07, d25, d34, d1625, d0734;
d07 = a0 - a7;
d16 = a1 - a6;
d25 = a2 - a5;
d34 = a3 - a4;
d1625 = s16 - s25;
d0734 = s07 - s34;
half2 sd16d07, sd25d34;
sd16d07 = d07 + d16;
sd25d34 = d25 + d34;
half2 m1_over_2, m2, m5, m6, m7, m8, m9;
// All results in 16.0
m1_over_2 = s0734 + s1625;
m2 = s0734 - s1625;
m5 = hi(mulrnd(COS_2, shift(d1625 + d0734, 2)));
m6 = hi(mulrnd(COS_2, shift(d25 + d16, 2)));
m7 = hi(mulrnd(COS_3, shift(sd16d07 - sd25d34, 2)));
m8 = hi(mulrnd(COS_1_plus_COS_3, shift(sd16d07, 2)));
m9 = hi(mulrnd(COS_1_minus_COS_3, shift(sd25d34, 2)));
half2 s5, s6, s7, s8;
s5 = d07 + m6;
s6 = d07 - m6;
s7 = m8 - m7;
s8 = m9 - m7;
array<half2> buf1(8); // intermediate dct output.
array<half2> buf2(8); // transposed intermediate dct output
// All results in 16.0
buf1[0] = hi(mulrnd(K[0], shift(m1_over_2, 2)));
buf1[1] = hi(mulrnd(K[1], shift(s5 + s7, 2)));
buf1[2] = hi(mulrnd(K[2], shift(d0734 + m5, 2)));
buf1[3] = hi(mulrnd(K[3], shift(s6 - s8, 2)));
buf1[4] = hi(mulrnd(K[4], shift(m2, 2)));
buf1[5] = hi(mulrnd(K[5], shift(s6 + s8, 2)));
buf1[6] = hi(mulrnd(K[6], shift(d0734 - m5, 2)));
buf1[7] = hi(mulrnd(K[7], shift(s5 - s7, 2)));
// Do comm stuff to transpose the matrix to do rows now
buf2[idx0] = buf1[idx0];
buf2[idx7] = commucperm(perm_a, buf1[idx1]);
buf2[idx6] = commucperm(perm_b, buf1[idx2]);
buf2[idx5] = commucperm(perm_c, buf1[idx3]);
buf2[idx4] = commucperm(perm_d, buf1[idx4]);
buf2[idx3] = commucperm(perm_e, buf1[idx5]);
buf2[idx2] = commucperm(perm_f, buf1[idx6]);
buf2[idx1] = commucperm(perm_g, buf1[idx7]);
// get a's from scratchpad -- In 16.0 format
a0 = buf2[0];
a1 = buf2[1];
a2 = buf2[2];
a3 = buf2[3];
a4 = buf2[4];
a5 = buf2[5];
a6 = buf2[6];
a7 = buf2[7];
s07 = a0 + a7;
s16 = a1 + a6;
s25 = a2 + a5;
s34 = a3 + a4;
s1625 = s16 + s25;
s0734 = s07 + s34;
d07 = a0 - a7;
d16 = a1 - a6;
d25 = a2 - a5;
d34 = a3 - a4;
d1625 = s16 - s25;
d0734 = s07 - s34;
sd16d07 = d07 + d16;
sd25d34 = d25 + d34;
// All results in 16.0
m1_over_2 = s0734 + s1625;
m2 = s0734 - s1625;
m5 = hi(mulrnd(COS_2, shift(d1625 + d0734, 2)));
m6 = hi(mulrnd(COS_2, shift(d25 + d16, 2)));
m7 = hi(mulrnd(COS_3, shift(sd16d07 - sd25d34, 2)));
m8 = hi(mulrnd(COS_1_plus_COS_3, shift(sd16d07, 2)));
m9 = hi(mulrnd(COS_1_minus_COS_3, shift(sd25d34, 2)));
s5 = d07 + m6;
s6 = d07 - m6;
s7 = m8 - m7;
s8 = m9 - m7;
half2 d0, d1, d2, d3, d4, d5, d6, d7;
d0 = m1_over_2;
d1 = s5 + s7;
d2 = d0734 + m5;
d3 = s6 - s8;
d4 = m2;
d5 = s6 + s8;
d6 = d0734 - m5;
d7 = s5 - s7;
// Round the quantized result such that 0.5 -> 1.0, and -0.5 -> -1.0.
uhalf2 round_cmp; // value to compare w/ fractional part of result
double<half2> dct_quant; // quantized dct coefficient
cc sign, add;
sign = itocc(int(d0 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[0] * shift(d0, 2);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
out << select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
sign = itocc(int(d1 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[1] * shift(d1, 2);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
out << select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
sign = itocc(int(d2 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[2] * shift(d2, 2);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
out << select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
sign = itocc(int(d3 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[3] * shift(d3, 2);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
out << select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
sign = itocc(int(d4 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[4] * shift(d4, 2);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
out << select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
sign = itocc(int(d5 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[5] * shift(d5, 2);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
out << select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
sign = itocc(int(d6 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[6] * shift(d6, 2);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
out << select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
sign = itocc(int(d7 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[7] * shift(d7, 2);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
out << select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -