📄 rle_kc.cpp
字号:
#include "idb_kernelc.hpp"
#include "mpeg.hpp"
#include "idb_kernelc2.hpp"
// Invokes the KERNELDEF macro (not defined in the visible part of this file)
// with the kernel name `rle` and the path KERNELS_DIR "rle_kc.uc".
// NOTE(review): presumably this associates the kernel defined below with its
// compiled microcode file -- confirm against the KERNELDEF definition.
KERNELDEF(rle, KERNELS_DIR "rle_kc.uc");
// rle.i -- Macroblock encode
// Ujval Kapasi
// 3/28/97
// 7/22/97
// 2/23/99
// 05/31/99
// 12/10/99
//
// RLE - run level encodes a macroblock for intra-coded frames.
// Encodes ac coefficients with run level encoding, and encodes the dc
// coefficients using predictive coding.
kernel rle(istream<half2> Yin,
istream<half2> CrCbin,
istream<int> indices,
cistream<half2> motion,
costream<half2> out,
uc<uint>& pframe,
uc<uint>& quant_scale)
{
uint clzero = uint(cid() == 0);
uint pfm = commclperm(0x8, 0, pframe);
cc Iframe = itocc(clzero & ~(pfm));
cc Pframe = itocc(clzero & pfm);
cc clust_zero = itocc(clzero); // cluster 0 ?
cc ccend; // for cistream
uc<int> perm_1 = 0x75643120;
uc<int> perm_2 = 0x67451320;
uc<int> perm_3 = 0x75261430;
uc<int> perm_4 = 0x57362401;
uc<int> perm_5 = 0x74352601;
uc<int> perm_6 = 0x64270531;
uc<int> perm_7 = 0x63170542;
expand<int> store_idx(8), send_idx(8);
indices >> send_idx[0];
indices >> store_idx[0];
indices >> send_idx[1];
indices >> store_idx[1];
indices >> send_idx[2];
indices >> store_idx[2];
indices >> send_idx[3];
indices >> store_idx[3];
indices >> send_idx[4];
indices >> store_idx[4];
indices >> send_idx[5];
indices >> store_idx[5];
indices >> send_idx[6];
indices >> store_idx[6];
indices >> send_idx[7];
indices >> store_idx[7];
// Comm permutations used to transpose the block
uc<int> perm_a = 0x07654321;
uc<int> perm_b = 0x10765432;
uc<int> perm_c = 0x21076543;
uc<int> perm_d = 0x32107654;
uc<int> perm_e = 0x43210765;
uc<int> perm_f = 0x54321076;
uc<int> perm_g = 0x65432107;
// RLE
loop_stream(Yin) {
// This code will run-level encode the data in the Y4Y3, Y2Y1 and CrCb
// arrays into the rle_out array. The data is stored in half2 format
// in the source array, with a 16-bit element from two different blocks
// stored in each location (word) in the array. The destination array
// will contain only data for one block in each location. The format
// of the result data will be such that the top 16 bits contain the run
// and the bottom 16 bits contain the level. So the layout of the source
// array is Y2Y1 in the first 8 words, then Y4Y3 in the 2nd eight, etc.
// Each of the eight clusters has 8 such elements, for a total of 64.
// The destination array format is Y1 in the first 8 words, Y2 in the
// second eight, Y3 in the third eight, etc.
send_idx[0] %= send_idx[0];
store_idx[0] %= store_idx[0];
send_idx[1] %= send_idx[1];
store_idx[1] %= store_idx[1];
send_idx[2] %= send_idx[2];
store_idx[2] %= store_idx[2];
send_idx[3] %= send_idx[3];
store_idx[3] %= store_idx[3];
send_idx[4] %= send_idx[4];
store_idx[4] %= store_idx[4];
send_idx[5] %= send_idx[5];
store_idx[5] %= store_idx[5];
send_idx[6] %= send_idx[6];
store_idx[6] %= store_idx[6];
send_idx[7] %= send_idx[7];
store_idx[7] %= store_idx[7];
array<half2> Y4Y3(8), Y2Y1(8), CrCb(8);
half2 tmp;
Yin >> tmp; Y2Y1[0] = tmp;
Yin >> tmp; Y2Y1[1] = tmp;
Yin >> tmp; Y2Y1[2] = tmp;
Yin >> tmp; Y2Y1[3] = tmp;
Yin >> tmp; Y2Y1[4] = tmp;
Yin >> tmp; Y2Y1[5] = tmp;
Yin >> tmp; Y2Y1[6] = tmp;
Yin >> tmp; Y2Y1[7] = tmp;
Yin >> tmp; Y4Y3[0] = tmp;
Yin >> tmp; Y4Y3[1] = tmp;
Yin >> tmp; Y4Y3[2] = tmp;
Yin >> tmp; Y4Y3[3] = tmp;
Yin >> tmp; Y4Y3[4] = tmp;
Yin >> tmp; Y4Y3[5] = tmp;
Yin >> tmp; Y4Y3[6] = tmp;
Yin >> tmp; Y4Y3[7] = tmp;
CrCbin >> tmp; CrCb[0] = tmp; CrCbin >> tmp; CrCb[1] = tmp;
CrCbin >> tmp; CrCb[2] = tmp; CrCbin >> tmp; CrCb[3] = tmp;
CrCbin >> tmp; CrCb[4] = tmp; CrCbin >> tmp; CrCb[5] = tmp;
CrCbin >> tmp; CrCb[6] = tmp; CrCbin >> tmp; CrCb[7] = tmp;
// The zig zag ordering code below will reorder the Y2Y1 data into words
// 0-7 of the zigzag array in each cluster. Y4Y3 will go into words
// 8-15 of the zigzag array, and CrCb into words 16-23. The
// reordering is achieved by each cluster independently sending data
// from different locations and storing them in different locations
// also. Using this method, eight communications are necessary to reorder
// the data. However, the first communication is unnecessary, because it
// only needs to reorder data within the cluster (ie, it is moving data
// that needs to be stored in the same cluster again.) Note the zig-zag
// code reorders the data in such a way as to transpose the data also.
// Thus, after the zig-zag reordering, cluster 0 will have elements 0-7
// of each block, cluster 1 will have elements 8-15, etc. This is how
// the data is initially :
//
// Cluster
// 0 1 2 3 4 5 6 7
//
//
// 0 0 2 3 9 10 20 21 35
//
// 1 1 4 8 11 19 22 34 36
//
// 2 5 7 12 18 23 33 37 48
//
// Index 3 6 13 17 24 32 38 47 49
//
// 4 14 16 25 31 39 46 50 57
//
// 5 15 26 30 40 45 51 56 58
//
// 6 27 29 41 44 52 55 59 62
//
// 7 28 42 43 53 54 60 61 63
//
// Thus, element 0 needs to move from location 0 in cluster 0 to location
// 0 in cluster 0 (again). Element 1 also goes to the same logical
// location. Element 2 needs to move from location 0 of cluster 1 to
// location 2 of cluster 0. Element 3 needs to go from location 0 of
// cluster 2 to location 3 of cluster 0, and so on. Here are all the
// communications necessary :
//
// Communication 1 Communication 2 Communication 3 Communication 4
// 0[0] -> 0[0] 0[1] -> 0[1] 0[2] -> 0[5] 0[3] -> 0[6]
// 1[3] -> 1[5] 1[4] -> 2[0] 1[5] -> 3[2] 1[6] -> 3[5]
// 2[3] -> 2[1] 2[1] -> 1[0] 2[2] -> 1[4] 2[6] -> 5[1]
// 3[3] -> 3[0] 3[4] -> 3[7] 3[2] -> 2[2] 3[0] -> 1[1]
// 4[3] -> 4[0] 4[4] -> 4[7] 4[5] -> 5[5] 4[1] -> 2[3]
// 5[4] -> 5[6] 5[5] -> 6[3] 5[3] -> 4[6] 5[6] -> 6[7]
// 6[4] -> 6[2] 6[3] -> 5[7] 6[5] -> 7[0] 6[1] -> 4[2]
// 7[4] -> 7[1] 7[5] -> 7[2] 7[2] -> 6[0] 7[6] -> 7[6]
//
// Communication 5 Communication 6 Communication 7 Communication 8
// 0[4] -> 1[6] 0[5] -> 1[7] 0[6] -> 3[3] 0[7] -> 3[4]
// 1[0] -> 0[2] 1[1] -> 0[4] 1[2] -> 0[7] 1[7] -> 5[2]
// 2[4] -> 3[1] 2[5] -> 3[6] 2[7] -> 5[3] 2[0] -> 0[3]
// 3[5] -> 5[0] 3[6] -> 5[4] 3[1] -> 1[3] 3[7] -> 6[5]
// 4[2] -> 2[7] 4[6] -> 6[4] 4[7] -> 6[6] 4[0] -> 1[2]
// 5[7] -> 7[4] 5[2] -> 4[1] 5[0] -> 2[4] 5[1] -> 2[6]
// 6[2] -> 4[5] 6[0] -> 2[5] 6[6] -> 7[3] 6[7] -> 7[5]
// 7[3] -> 6[1] 7[7] -> 7[7] 7[0] -> 4[3] 7[1] -> 4[4]
//
// The resulting indices that are required for each permutation can be
// found in the constants file. The required permutations are :
array<int> zigzag(24); // the output array
zigzag[0+store_idx[0]] = int(Y2Y1[send_idx[0]]);
zigzag[8+store_idx[0]] = int(Y4Y3[send_idx[0]]);
zigzag[16+store_idx[0]] = int(CrCb[send_idx[0]]);
zigzag[0+store_idx[1]] = int(commucperm(perm_1, Y2Y1[send_idx[1]]));
zigzag[8+store_idx[1]] = int(commucperm(perm_1, Y4Y3[send_idx[1]]));
zigzag[16+store_idx[1]] = int(commucperm(perm_1, CrCb[send_idx[1]]));
zigzag[0+store_idx[2]] = int(commucperm(perm_2, Y2Y1[send_idx[2]]));
zigzag[8+store_idx[2]] = int(commucperm(perm_2, Y4Y3[send_idx[2]]));
zigzag[16+store_idx[2]] = int(commucperm(perm_2, CrCb[send_idx[2]]));
zigzag[0+store_idx[3]] = int(commucperm(perm_3, Y2Y1[send_idx[3]]));
zigzag[8+store_idx[3]] = int(commucperm(perm_3, Y4Y3[send_idx[3]]));
zigzag[16+store_idx[3]] = int(commucperm(perm_3, CrCb[send_idx[3]]));
zigzag[0+store_idx[4]] = int(commucperm(perm_4, Y2Y1[send_idx[4]]));
zigzag[8+store_idx[4]] = int(commucperm(perm_4, Y4Y3[send_idx[4]]));
zigzag[16+store_idx[4]] = int(commucperm(perm_4, CrCb[send_idx[4]]));
zigzag[0+store_idx[5]] = int(commucperm(perm_5, Y2Y1[send_idx[5]]));
zigzag[8+store_idx[5]] = int(commucperm(perm_5, Y4Y3[send_idx[5]]));
zigzag[16+store_idx[5]] = int(commucperm(perm_5, CrCb[send_idx[5]]));
zigzag[0+store_idx[6]] = int(commucperm(perm_6, Y2Y1[send_idx[6]]));
zigzag[8+store_idx[6]] = int(commucperm(perm_6, Y4Y3[send_idx[6]]));
zigzag[16+store_idx[6]] = int(commucperm(perm_6, CrCb[send_idx[6]]));
zigzag[0+store_idx[7]] = int(commucperm(perm_7, Y2Y1[send_idx[7]]));
zigzag[8+store_idx[7]] = int(commucperm(perm_7, Y4Y3[send_idx[7]]));
zigzag[16+store_idx[7]] = int(commucperm(perm_7, CrCb[send_idx[7]]));
// First, each cluster independently does the run-level encoding for
// the eight values of each block that that cluster has. Later, the
// "mini-blocks" of eight will be patched up. To do the run level encoding,
// a variable called run is used to keep track of the current run. The
// final value of this variable needs to be saved and will be used to
// "patch up" the encoding for the whole block. Also, only the run-level
// pairs are recorded. All zero elements are removed. Thus, not all eight
// locations within a block hold useful data at the end of this operation.
// A zero is used to mark the first non-useful data (if there are fewer
// than eight actual run-level pairs). Storing only the actual run-level
// pairs speeds up the conditional output stream loop later because it
// doesn't necessarily have to check all eight values, and thus the loop
// can break early if possible. Finally, the data is initially
// stored in half2 format with each 16 bit half containing an element
// from a different block. Thus, each loop iteration, each cluster run
// level encodes eight values from two different blocks at the same time.
// The first block's results overwrite the original 8 elements, and the
// second block's results will occupy the eight locations that start
// right after the source elements. Thus, for example, Y4Y3 originally
// is in locations 16-23, and the results for Y3 will be in 16-23, and
// those for Y4 will be in 24-31. (Actually, there may be fewer than eight
// results for each block, but the layout remains the same...)
uc<int> j, k; // loop vars
cc ac_zero; // original half2 elements == 0 ??
cc all_non_zero; // Were all 8 values in this cluster != 0 for this block
int src_idx = 0; // source data index
int data_idx = 0; // base destination data index
int lo_idx; // result data index for first block
int hi_idx; // result data index for second block
int hi_mask, lo_mask; // 16 bit masks
int run_level; // high 16 bits = run, low 16 bits = level
half2 run; // current run of zeros
half2 ac_val; // original levels (zero and non zero)
int lo_ac_val; // ac value for first block
int hi_ac_val; // ac value for second block
int run_idx = 0; // 0-5 for Y1,Y2,Y3,Y4,Cb,CR respectively
int tmp_inc;
int hi_inc; // increment for hi_idx
array<int> rle_out(96); // array holding the final run_levels
array<int> runs(12); // remainder run; used for "patching up"
// initialize rle_out with -1s
uc<int> i = 12;
loop_count(i) {
rle_out[0+data_idx] = -1;
rle_out[1+data_idx] = -1;
rle_out[2+data_idx] = -1;
rle_out[3+data_idx] = -1;
data_idx = data_idx + 4;
}
half2 h2_one = 1 | half2(shift(1, 16));
lo_mask = shift(-1, -16);
hi_mask = shift(-1, 16);
data_idx = 0;
i = 3;
loop_count(i) {
lo_idx = data_idx;
hi_idx = data_idx + 8;
// get values and check if they == 0
ac_val = half2(zigzag[src_idx]);
ac_zero = itocc(int(ac_val == 0));
// first run is 0, only set level - will be fixed by "patch up" later
lo_ac_val = int(ac_val) & lo_mask;
run_level = lo_ac_val;
rle_out[lo_idx] = run_level;
// do the same for second block
hi_ac_val = shift(int(ac_val), -16);
run_level = hi_ac_val;
rle_out[hi_idx] = run_level;
// update source data index
src_idx = src_idx + 1;
// result pointers update unconditionally if it was a dc value; else
// if it was an ac value, they only increment if the level != 0
lo_idx = lo_idx + select(Iframe, 1, select(ac_zero, 0, 1));
tmp_inc = int(select(ac_zero, half2(0), h2_one));
hi_inc = shift(tmp_inc, -16);
hi_idx = hi_idx + select(Iframe, 1, hi_inc);
// update the current run of zeros, and reset to 0 if level was non_zero
run = half2(select(Iframe, 0, int(select(ac_zero, h2_one, 0))));
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -