📄 mpeg_kc.cpp
字号:
#include "idb_kernelc.hpp"
#include "mpeg.hpp"
#include "idb_kernelc2.hpp"
// Kernel definition entry for mb_encode (the kernel body follows below).
// NOTE(review): KERNELDEF is declared in a project header not visible here;
// presumably the string argument names the microcode file associated with
// this kernel -- confirm against idb_kernelc.hpp.
KERNELDEF(mb_encode, "mpeg_kc.uc");
// mb_encode.i -- Macroblock encode
// Ujval Kapasi
// 3/28/97
// 7/22/97
// 2/23/99
//
// This is actually a concatenation of three kernels...
//
// 1.
// Color space warp (RGB space ---> YCrCb space) for JPEG
// Includes a 2:1 subsampling in the horizontal and vertical direction
// for Cr and Cb. These are obtained by interpolating between two pixels.
// The sampling is done as follows (4:2:0 MPEG-2 format) :
// o o o o o (the "o" represents a luminance sample,
// . . . the "." represents a chrominance sample)
// o o o o o
// . . .
// o o o o o
//
//
// Block layout : Y1 Y2 -- each block is 8x8
// Y3 Y4
//
// The input comes in such that the first row of block Y1 comes in, one
// per cluster. Then the first row of block Y2, then the second row of
// block Y1, second row of block Y2, and so on. Then after the eighth
// row of Y2, the same pattern repeats for the rows of blocks Y3 and Y4.
//
// Input : color = 0 | R || G | B // in 8.0 format
//
// Output : Y = Y2 || Y1 // in 16.0 format
// C = Cr || Cb // in 16.0 format
//
//
// NOTE : IGNORING GAMMA CORRECTION for now
//
// NOTE : eventually should be done in count up mode for generality
//
// ISTREAM 1 : constants stored in SRF and brought into clusters
//
//
//
// 2.
// 8x8 DCT (for JPEG and MPEG)
//
// From Pennebaker/Mitchell, pg. 50-52. See also Arai, Agui, Nakajima.
// This algorithm is based on the 16-pt DFT. Basically, the 8-pt DCT can
// be calculated by scaling the real parts of the output of the 16-pt DFT.
//
// ISTREAM 0 : constants stored in SRF and brought into clusters
//
//
//
//
// 3.
// RLE - run level encodes a macroblock for intra-coded frames.
// Encodes ac coefficients with run level encoding, and encodes the dc
// coefficients using predictive coding.
//
// ISTREAM 0 : constants stored in SRF and brought into clusters
//
kernel mb_encode(istream<byte4> datain,
istream<half2> consts,
ostream<half2> color_out,
ostream<half2> dct_out,
costream<int> out)
{
// Color constants
// Format is : 0 - 7 : 1 row/column from Y2 and Y1 blocks. Y2
// entries in high part and Y1 in low part
// of every word.
// : 8 - 15, 16 - 23 : Y4 and Y3, and Cr and Cb respectively
array<half2> CrCbY4Y3Y2Y1_a(24);
array<half2> CrCbY4Y3Y2Y1_b(24);
array<int> dc_pred(3); // for rle.i
int minus_one = 0 - 1;
int two = 1 + 1;
int four = two + two;
int sixteen = lo(four * four);
int minus_sixteen = 0 - sixteen;
// cluster : 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
// perm_A : C7 | C6 | C5 | C4 | C6 | C4 | C2 | C0 |
// perm_B : C6 | C4 | C2 | C0 | C3 | C2 | C1 | C0 |
uc<int> perm_A = 0x76546420;
uc<int> perm_B = 0x64203210;
cc low = itocc(cid() < four);
cc Y_combine = itocc(shift(minus_one, sixteen));
// Y_combine = itocc(half2(0) == half2(1)); // TRUE || FALSE
// Shuffle control words
byte4 shuf_func1, shuf_func2, shuf_func3;
//shuf_func1 = 0x08020800; // zero | 3rd || zero | 1st
//shuf_func2 = 0x08080801; // zero | zero || zero | 2nd
//shuf_func3 = 0x01000100; // 1st || 1st
half2 temp;
consts >> temp;
shuf_func1 = byte4(temp);
consts >> temp;
shuf_func2 = byte4(temp);
consts >> temp;
shuf_func3 = byte4(temp);
// Luminance transform constants
half2 RB_SCALE, G_SCALE, C_SCALE;
// in 1.15 format
// RB_SCALE = 0x26460e98 // 0.299 || 0.114
// G_SCALE = 0x00004b23 // 0 || 0.587
// C_SCALE = 0x50004000 // 0.625 || 0.5
consts >> RB_SCALE >> G_SCALE >> C_SCALE;
half2 one_two_eight; // For adding by 128
// one_two_eight = 0x01000100;
consts >> one_two_eight;
// DCT constants
int three = two + 1;
int seven = four + three;
half2 COS_2, COS_3, COS_1_plus_COS_3, COS_1_minus_COS_3;
// Stored in 2.14 format
// COS_2 = 0x2d412d41; // cos(2*pi/8) || cos(2*pi/8);
// COS_3 = 0x187e187e; // cos(3*pi/8) || cos(3*pi/8);
// COS_1_plus_COS_3 = 0x539f539f; // cos(pi/8) + cos(3*pi/8) || same
// COS_1_minus_COS_3 = 0x22a322a3; // cos(pi/8) - cos(3*pi/8) || same
consts >> COS_2 >> COS_3 >> COS_1_plus_COS_3 >> COS_1_minus_COS_3;
array<half2> K(8);
// Stored in 2.14 format
// K0 = 0x16a116a1 // 0.25 * sqrt(2) || 0.25 * sqrt(2);
// K1 = 0x10501050 // 0.25 * sec(pi/16) || 0.25 * sec(pi/16);
// K2 = 0x11511151 // 0.25 * sec(2*pi/16) || 0.25 * sec(2*pi/16);
// K3 = 0x133e133e // 0.25 * sec(3*pi/16) || 0.25 * sec(3*pi/16);
// K4 = 0x16a116a1 // 0.25 * sec(4*pi/16) || 0.25 * sec(4*pi/16);
// K5 = 0x1ccd1ccd // 0.25 * sec(5*pi/16) || 0.25 * sec(5*pi/16);
// K6 = 0x29cf29cf // 0.25 * sec(6*pi/16) || 0.25 * sec(6*pi/16);
// K7 = 0x52035203 // 0.25 * sec(7*pi/16) || 0.25 * sec(7*pi/16);
consts >> K[0] >> K[1] >> K[2] >> K[3] >> K[4] >> K[5] >> K[6] >> K[7];
// The quantization factors are assumed to be stored in the SRF as
// 8/QuantFactor. This can be done without a loss of generality. These
// fractions can then be read and directly multiplied with the appropriate
// DCT coefficients, avoiding costly divides. More optimizations are also
// made. The final step of the DCT multiplies each DCT coefficient with
// a constant. This constant and the quantization coefficient are combined
// to reduce the number of multiplies which are necessary. Also, the
// division by the quantizer scale is combined with the quantization
// coefficients. (The reciprocal of the quantizer scale is what is
// actually stored in the SRF, similar to the quant factors.) Also, the
// same quantization factors are used for all the luminance blocks.
// However, because of the layout of the source array (CrCbY4Y3Y2Y1),
// it makes sense to just store the quantization coefficients twice.
// Thus, locations 0-7 in the quantization factor array are the
// luminance quantization factors, and locations 8-15 are the same thing.
// Locations 16-23 are the CrCb quantization factors.
//
// quant_scale in unsigned 1.15 (max value is 1.0)
// istream is in unsigned 1.15 (max value is 1.0)
// (quant_scale * istream) is in signed 2.14 (max value is 1.0)
// K is in 2.14 (max value is approx 1.30...)
// K * (quant_scale * istream) is in 4.12 -- max value is approx 1.30...
// -- need at least signed 2.14
// quant is in signed 2.14
uhalf2 quant_scale;
array<half2> quant(24);
consts >> temp;
quant_scale = uhalf2(temp);
int quant_idx = 0;
int k_idx = 0;
uhalf2 utmp;
// this loop calculates the combined quantization factors for luminance
// blocks.
uc<int> i = 8;
loop_count(i) unroll(1) {
consts >> temp;
utmp = uhalf2(temp); // get original quant factor
// adjust with quantizer scale
utmp = shift(hi(quant_scale * utmp), 1); // utmp in unsigned 1.15
// calculate new combined dct/quantization coefficient
quant[quant_idx] = half2(hi(utmp * uhalf2(shift(K[quant_idx], 1))));
quant_idx = quant_idx + 1;
}
// this loop just copies the above coefficients for simplicity of
// implementation later
i = 8;
int quant_idx2 = 0;
loop_count(i) unroll(1) {
quant[quant_idx] = quant[quant_idx2];
quant_idx = quant_idx + 1;
quant_idx2 = quant_idx2 + 1;
}
// this loop calculates the combined quantization factors for chrominance
// blocks.
i = 8;
k_idx = quant_idx & seven;
loop_count(i) unroll(1) {
consts >> temp;
utmp = uhalf2(temp);
utmp = shift(hi(quant_scale * utmp), 1);
quant[quant_idx] = half2(hi(utmp * uhalf2(shift(K[k_idx], 1))));
quant_idx = quant_idx + 1;
k_idx = quant_idx & seven;
}
// The permutation can be done with 8 communications. (Actually the
// first communication is really just an inter-cluster move because
// the diagonal doesn't change in a transpose.) Each communication
// uses a different permutation and different send and store indices
// for each cluster.
// Comm permutations used to transpose the block
uc<int> perm_a = 0x07654321;
uc<int> perm_b = 0x10765432;
uc<int> perm_c = 0x21076543;
uc<int> perm_d = 0x32107654;
uc<int> perm_e = 0x43210765;
uc<int> perm_f = 0x54321076;
uc<int> perm_g = 0x65432107;
// RLE CONSTANTS
// reset value for the dc predictions
int dp;
consts >> temp;
dp = int(temp);
dc_pred[0] = dp; // Y blocks
dc_pred[1] = dp; // Cb blocks
dc_pred[2] = dp; // Cr blocks
uc<int> perm_1 = 0x75643120;
uc<int> perm_2 = 0x67451320;
uc<int> perm_3 = 0x75261430;
uc<int> perm_4 = 0x57362401;
uc<int> perm_5 = 0x74352601;
uc<int> perm_6 = 0x64270531;
uc<int> perm_7 = 0x63170542;
array<int> send_idx_arr(8), store_idx_arr(8);
consts >> temp; send_idx_arr[0] = int(temp);
consts >> temp; store_idx_arr[0] = int(temp);
consts >> temp; send_idx_arr[1] = int(temp);
consts >> temp; store_idx_arr[1] = int(temp);
consts >> temp; send_idx_arr[2] = int(temp);
consts >> temp; store_idx_arr[2] = int(temp);
consts >> temp; send_idx_arr[3] = int(temp);
consts >> temp; store_idx_arr[3] = int(temp);
consts >> temp; send_idx_arr[4] = int(temp);
consts >> temp; store_idx_arr[4] = int(temp);
consts >> temp; send_idx_arr[5] = int(temp);
consts >> temp; store_idx_arr[5] = int(temp);
consts >> temp; send_idx_arr[6] = int(temp);
consts >> temp; store_idx_arr[6] = int(temp);
consts >> temp; send_idx_arr[7] = int(temp);
consts >> temp; store_idx_arr[7] = int(temp);
// Color
// loop counts
int y_idx = 0;
int c_idx;
i = 8;
byte4 color1, color2, color3, color4;
half2 a0, a1, a2, a3, a4, a5, a6, a7, a1a3, a2a4, b1, b2, b3, b4;
half2 c1, c2, c3, c4, d0, d1, d2, d3, d4, d5, d6, d7, e1, e2, e3, e4;
half2 y1, y2, y3, y4, z1, z2, z3, z4, first, second;
// each iter. does one pixel in each of two blocks for two rows (4 pixels)
loop_count(i) pipeline(1) {
stage(1);
// NOTE : IGNORING GAMMA CORRECTION for now
// The input data are in 16.0.
datain >> color1 >> color2 >> color3 >> color4;
// a = R || B
// b = 0 || G
a1 = half2(shuffle(color1, shuf_func1));
b1 = half2(shuffle(color1, shuf_func2));
a2 = half2(shuffle(color2, shuf_func1));
b2 = half2(shuffle(color2, shuf_func2));
a3 = half2(shuffle(color3, shuf_func1));
b3 = half2(shuffle(color3, shuf_func2));
a4 = half2(shuffle(color4, shuf_func1));
b4 = half2(shuffle(color4, shuf_func2));
// After shifting, the inputs are in 15.1 format. The constants are in
// 1.15 format, so the result will be in 16.0 format. The result is
// c = 0.299R || 0.114B
c1 = hi(RB_SCALE * shift(a1, 1));
c2 = hi(RB_SCALE * shift(a2, 1));
c3 = hi(RB_SCALE * shift(a3, 1));
c4 = hi(RB_SCALE * shift(a4, 1));
// d = 0.299R || 0.114B + 0.587G (the high half still carries 0.299R from c)
d1 = c1 + hi(G_SCALE * shift(b1, 1));
d2 = c2 + hi(G_SCALE * shift(b2, 1));
d3 = c3 + hi(G_SCALE * shift(b3, 1));
d4 = c4 + hi(G_SCALE * shift(b4, 1));
// e = 0 || 0.299R
e1 = half2(shift(int(c1), minus_sixteen));
e2 = half2(shift(int(c2), minus_sixteen));
e3 = half2(shift(int(c3), minus_sixteen));
e4 = half2(shift(int(c4), minus_sixteen));
a1a3 = a1 + a3;
a2a4 = a2 + a4;
// y = - || Y (0.114B + 0.587G + 0.299R)
y1 = d1 + e1;
y2 = d2 + e2;
y3 = d3 + e3;
y4 = d4 + e4;
stage(2);
// z = Y || Y
z1 = half2(shuffle(y1, shuf_func3));
z2 = half2(shuffle(y2, shuf_func3));
z3 = half2(shuffle(y3, shuf_func3));
z4 = half2(shuffle(y4, shuf_func3));
CrCbY4Y3Y2Y1_a[0+y_idx] = select(Y_combine, z2, z1);
CrCbY4Y3Y2Y1_a[1+y_idx] = select(Y_combine, z4, z3);
// a and z are in 16.0. first and second are averages of two pixels,
// where each pixel adds the following weight to the average :
// (a-z)/1.6 + 128 || (a-z)/2 + 128. A little math is done to factor
// the multiplication (division) out of the average to reduce the number
// of necessary multiplications and to obtain the maximum precision
// without shifts. C_SCALE is in 1.15, so the division by two is implicit,
// and no shift is necessary. The addition by 128 is also factored out.
// Each pixel has an additive factor of 128, for a total of 256, divided
// by two is 128.
first = hi((a1a3 - (z1 + z3)) * C_SCALE) + one_two_eight;
second = hi((a2a4 - (z2 + z4)) * C_SCALE) + one_two_eight;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -