📄 mb_encode_kc.cpp
字号:
// The first four clusters get first, and the second half get second.
first = commucperm(perm_A, first);
second = commucperm(perm_B, second);
c_idx = shift(y_idx, minus_one);
y_idx = y_idx + two;
CrCbY4Y3Y2Y1_a[16+c_idx] = select(low, first, second);
}
// Emit all 24 words of the CrCbY4Y3Y2Y1_a array to the color_out stream,
// one word per write, in ascending index order (0 through 23).
color_out << CrCbY4Y3Y2Y1_a[0];
color_out << CrCbY4Y3Y2Y1_a[1];
color_out << CrCbY4Y3Y2Y1_a[2];
color_out << CrCbY4Y3Y2Y1_a[3];
color_out << CrCbY4Y3Y2Y1_a[4];
color_out << CrCbY4Y3Y2Y1_a[5];
color_out << CrCbY4Y3Y2Y1_a[6];
color_out << CrCbY4Y3Y2Y1_a[7];
color_out << CrCbY4Y3Y2Y1_a[8];
color_out << CrCbY4Y3Y2Y1_a[9];
color_out << CrCbY4Y3Y2Y1_a[10];
color_out << CrCbY4Y3Y2Y1_a[11];
color_out << CrCbY4Y3Y2Y1_a[12];
color_out << CrCbY4Y3Y2Y1_a[13];
color_out << CrCbY4Y3Y2Y1_a[14];
color_out << CrCbY4Y3Y2Y1_a[15];
color_out << CrCbY4Y3Y2Y1_a[16];
color_out << CrCbY4Y3Y2Y1_a[17];
color_out << CrCbY4Y3Y2Y1_a[18];
color_out << CrCbY4Y3Y2Y1_a[19];
color_out << CrCbY4Y3Y2Y1_a[20];
color_out << CrCbY4Y3Y2Y1_a[21];
color_out << CrCbY4Y3Y2Y1_a[22];
color_out << CrCbY4Y3Y2Y1_a[23];
// DCT
// Build the small constants and per-cluster indices used by the DCT loop
// that follows. `four`, `seven`, `sixteen` and `i` are declared outside
// this view.
int eight = four + four;
sixteen = eight + eight;
// h2_one: the value 1 in the low half OR'ed with 1 shifted by `sixteen`,
// i.e. a 1 in each packed half (assumes `four` is 4 so `sixteen` is 16, and
// that a positive shift count shifts left -- TODO confirm).
half2 h2_one = 1 | half2(shift(1, sixteen));
// uh2_half: h2_one shifted by (sixteen - 1), which puts the top bit of each
// 16-bit half; used as the "0.5" threshold when rounding.
uhalf2 uh2_half = shift(uhalf2(h2_one), sixteen - 1);
// uh2_almost_half: one less than uh2_half in each half; the alternate
// rounding threshold.
uhalf2 uh2_almost_half = uh2_half - uhalf2(h2_one);
// calculate cluster dependent send and store indices
// dest_idx / src_idx are running word offsets into the output and input
// arrays; both start at 0.
int dest_idx = 0;
int src_idx = 0;
// idx0 is whatever cid() returns (presumably this cluster's id -- verify).
// idx1..idx7 each step back by one from the previous index and wrap with
// `& seven`, so together idx0..idx7 cover eight wrapped positions.
int idx0 = cid();
int idx1 = (idx0 - 1) & seven;
int idx2 = (idx1 - 1) & seven;
int idx3 = (idx2 - 1) & seven;
int idx4 = (idx3 - 1) & seven;
int idx5 = (idx4 - 1) & seven;
int idx6 = (idx5 - 1) & seven;
int idx7 = (idx6 - 1) & seven;
// Value handed to loop_count() for the DCT loop.
i = 3;
// DCT + quantization loop.
// Each iteration reads eight consecutive words of CrCbY4Y3Y2Y1_a (starting
// at src_idx), runs a 1-D DCT on them, exchanges the results between
// clusters to transpose the data, runs a second 1-D DCT, then quantizes and
// rounds the eight outputs into CrCbY4Y3Y2Y1_b (starting at dest_idx).
// src_idx and dest_idx both advance by `eight` per iteration.
// NOTE(review): `i` is set to 3 just before this loop, so presumably the
// body runs three times (3 x 8 = the 24 words written out afterwards);
// the meaning of pipeline(67) and stage(n) is defined by the kernel
// compiler and is not visible here -- confirm before reordering anything.
loop_count(i) pipeline(67) {
stage(1);
// get a's from scratchpad -- in 16.0 format
a0 = CrCbY4Y3Y2Y1_a[0+src_idx];
a1 = CrCbY4Y3Y2Y1_a[1+src_idx];
a2 = CrCbY4Y3Y2Y1_a[2+src_idx];
a3 = CrCbY4Y3Y2Y1_a[3+src_idx];
a4 = CrCbY4Y3Y2Y1_a[4+src_idx];
a5 = CrCbY4Y3Y2Y1_a[5+src_idx];
a6 = CrCbY4Y3Y2Y1_a[6+src_idx];
a7 = CrCbY4Y3Y2Y1_a[7+src_idx];
// Advance to the next group of eight input words for the next iteration.
src_idx = src_idx + eight;
// do the 1d dct
// First butterfly layer: sums of mirrored pairs (a0/a7, a1/a6, ...) and
// sums of those sums.
half2 s16, s07, s25, s34, s1625, s0734;
s07 = a0 + a7;
s16 = a1 + a6;
s25 = a2 + a5;
s34 = a3 + a4;
s1625 = s16 + s25;
s0734 = s07 + s34;
// Differences of the same mirrored pairs, and of the pair sums.
half2 d16, d07, d25, d34, d1625, d0734;
d07 = a0 - a7;
d16 = a1 - a6;
d25 = a2 - a5;
d34 = a3 - a4;
d1625 = s16 - s25;
d0734 = s07 - s34;
half2 sd16d07, sd25d34;
sd16d07 = d07 + d16;
sd25d34 = d25 + d34;
half2 m1_over_2, m2, m5, m6, m7, m8, m9;
// All results in 16.0
// Each multiply pre-shifts its operand by `two` and keeps hi() of the
// product (presumably the upper half of the double-width result, with the
// shift compensating for the fixed-point format of the COS_* constants --
// TODO confirm against the constants file).
m1_over_2 = s0734 + s1625;
m2 = s0734 - s1625;
m5 = hi(COS_2 * shift(d1625 + d0734, two));
m6 = hi(COS_2 * shift(d25 + d16, two));
m7 = hi(COS_3 * shift(sd16d07 - sd25d34, two));
m8 = hi((COS_1_plus_COS_3) * shift(sd16d07, two));
m9 = hi((COS_1_minus_COS_3) * shift(sd25d34, two));
half2 s5, s6, s7, s8;
s5 = d07 + m6;
s6 = d07 - m6;
s7 = m8 - m7;
s8 = m9 - m7;
array<half2> buf1(8); // intermediate dct output. ie, do rows
array<half2> buf2(8); // then store here. Then index into this
// differently to get the columns
// All results in 16.0
// Outputs of the first 1-D pass, each scaled by the matching K[] entry.
buf1[0] = hi(K[0] * shift(m1_over_2, two));
buf1[1] = hi(K[1] * shift(s5 + s7, two));
buf1[2] = hi(K[2] * shift(d0734 + m5, two));
buf1[3] = hi(K[3] * shift(s6 - s8, two));
buf1[4] = hi(K[4] * shift(m2, two));
buf1[5] = hi(K[5] * shift(s6 + s8, two));
buf1[6] = hi(K[6] * shift(d0734 - m5, two));
buf1[7] = hi(K[7] * shift(s5 - s7, two));
// Transpose the 8x8 data across clusters so the second 1-D pass works on
// the other dimension. The element at idx0 is copied locally; the other
// seven are passed through commucperm with perm_a..perm_g and stored at
// the mirrored index (sent from idx1 -> stored at idx7, idx2 -> idx6, ...).
buf2[idx0] = buf1[idx0];
buf2[idx7] = commucperm(perm_a, buf1[idx1]);
buf2[idx6] = commucperm(perm_b, buf1[idx2]);
buf2[idx5] = commucperm(perm_c, buf1[idx3]);
buf2[idx4] = commucperm(perm_d, buf1[idx4]);
buf2[idx3] = commucperm(perm_e, buf1[idx5]);
buf2[idx2] = commucperm(perm_f, buf1[idx6]);
buf2[idx1] = commucperm(perm_g, buf1[idx7]);
// Reload a0..a7 from the transposed buffer buf2 (not from the input
// array) -- in 16.0 format
a0 = buf2[0];
a1 = buf2[1];
a2 = buf2[2];
a3 = buf2[3];
a4 = buf2[4];
a5 = buf2[5];
a6 = buf2[6];
a7 = buf2[7];
stage(2);
// Second 1-D DCT pass: same butterfly structure as the first pass, applied
// to the transposed data.
s07 = a0 + a7;
s16 = a1 + a6;
s25 = a2 + a5;
s34 = a3 + a4;
s1625 = s16 + s25;
s0734 = s07 + s34;
d07 = a0 - a7;
d16 = a1 - a6;
d25 = a2 - a5;
d34 = a3 - a4;
d1625 = s16 - s25;
d0734 = s07 - s34;
sd16d07 = d07 + d16;
sd25d34 = d25 + d34;
// All results in 16.0
m1_over_2 = s0734 + s1625;
m2 = s0734 - s1625;
m5 = hi(COS_2 * shift(d1625 + d0734, two));
m6 = hi(COS_2 * shift(d25 + d16, two));
m7 = hi(COS_3 * shift(sd16d07 - sd25d34, two));
m8 = hi((COS_1_plus_COS_3) * shift(sd16d07, two));
m9 = hi((COS_1_minus_COS_3) * shift(sd25d34, two));
s5 = d07 + m6;
s6 = d07 - m6;
s7 = m8 - m7;
s8 = m9 - m7;
// Unlike the first pass, the outputs are NOT multiplied by K[] here; the
// scaling is applied through the quant[] multiply below (presumably K is
// folded into the quant table -- verify against the constants file).
d0 = m1_over_2;
d1 = s5 + s7;
d2 = d0734 + m5;
d3 = s6 - s8;
d4 = m2;
d5 = s6 + s8;
d6 = d0734 - m5;
d7 = s5 - s7;
// Round the quantized result such that 0.5 -> 1.0, and -0.5 -> -1.0.
// The same five-step pattern is repeated for d0..d7:
//   1. sign      = (d <= 0)
//   2. round_cmp = threshold picked by sign from uh2_half / uh2_almost_half
//   3. dct_quant = quant[k + dest_idx] * (d shifted by `two`), double width
//   4. add       = (round_cmp < lo(dct_quant)), lo() compared as unsigned
//   5. result    = hi(dct_quant), plus h2_one when `add` is set
// NOTE(review): assumes select(c, x, y) yields x where c is true; under
// that reading non-positive inputs round up only when the low part is
// strictly above one half, positive inputs when it is at or above one half.
uhalf2 round_cmp; // value to compare w/ fractional part of result
double<half2> dct_quant; // quantized dct coefficient
cc sign, add;
// coefficient 0
sign = itocc(int(d0 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[0+dest_idx] * shift(d0, two);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
CrCbY4Y3Y2Y1_b[0+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
// coefficient 1
sign = itocc(int(d1 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[1+dest_idx] * shift(d1, two);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
CrCbY4Y3Y2Y1_b[1+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
// coefficient 2
sign = itocc(int(d2 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[2+dest_idx] * shift(d2, two);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
CrCbY4Y3Y2Y1_b[2+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
// coefficient 3
sign = itocc(int(d3 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[3+dest_idx] * shift(d3, two);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
CrCbY4Y3Y2Y1_b[3+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
// coefficient 4
sign = itocc(int(d4 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[4+dest_idx] * shift(d4, two);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
CrCbY4Y3Y2Y1_b[4+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
// coefficient 5
sign = itocc(int(d5 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[5+dest_idx] * shift(d5, two);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
CrCbY4Y3Y2Y1_b[5+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
// coefficient 6
sign = itocc(int(d6 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[6+dest_idx] * shift(d6, two);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
CrCbY4Y3Y2Y1_b[6+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
// coefficient 7
sign = itocc(int(d7 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[7+dest_idx] * shift(d7, two);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
CrCbY4Y3Y2Y1_b[7+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
// Advance to the next group of eight output words (and quant[] entries).
dest_idx = dest_idx + eight;
}
// At this point each cluster has a row of data.
// Emit all 24 quantized words of CrCbY4Y3Y2Y1_b to the dct_out stream,
// one word per write, in ascending index order (0 through 23).
dct_out << CrCbY4Y3Y2Y1_b[0];
dct_out << CrCbY4Y3Y2Y1_b[1];
dct_out << CrCbY4Y3Y2Y1_b[2];
dct_out << CrCbY4Y3Y2Y1_b[3];
dct_out << CrCbY4Y3Y2Y1_b[4];
dct_out << CrCbY4Y3Y2Y1_b[5];
dct_out << CrCbY4Y3Y2Y1_b[6];
dct_out << CrCbY4Y3Y2Y1_b[7];
dct_out << CrCbY4Y3Y2Y1_b[8];
dct_out << CrCbY4Y3Y2Y1_b[9];
dct_out << CrCbY4Y3Y2Y1_b[10];
dct_out << CrCbY4Y3Y2Y1_b[11];
dct_out << CrCbY4Y3Y2Y1_b[12];
dct_out << CrCbY4Y3Y2Y1_b[13];
dct_out << CrCbY4Y3Y2Y1_b[14];
dct_out << CrCbY4Y3Y2Y1_b[15];
dct_out << CrCbY4Y3Y2Y1_b[16];
dct_out << CrCbY4Y3Y2Y1_b[17];
dct_out << CrCbY4Y3Y2Y1_b[18];
dct_out << CrCbY4Y3Y2Y1_b[19];
dct_out << CrCbY4Y3Y2Y1_b[20];
dct_out << CrCbY4Y3Y2Y1_b[21];
dct_out << CrCbY4Y3Y2Y1_b[22];
dct_out << CrCbY4Y3Y2Y1_b[23];
// RLE
// This code will run-level encode the data in the CrCbY4Y3Y2Y1
// array into the rle_out array. The data is stored in half2 format in
// the source array, with a 16-bit element from two different blocks
// stored in each location (word) in the array. The destination array
// will contain only data for one block in each location. The format
// of the result data will be such that the top 16 bits contain the run
// and the bottom 16 bits contain the level (except for the dc value,
// which only contains the dc prediction.) So the layout of the source
// array is Y2Y1 in the first 8 words, then Y4Y3 in the 2nd eight, etc.
// Each of the eight clusters has 8 such elements, for a total of 64.
// The destination array format is Y1 in the first 8 words, Y2 in the
// second eight, Y3 in the third eight, etc.
// The zig zag ordering code below will reorder the Y2Y1 data into words
// 0-7 in the destination array in each cluster. Y4Y3 will go into words
// 16-23 in the destination array, and CrCb into words 32-39. The
// reordering is achieved by each cluster independently sending data
// from different locations and storing them in different locations
// also. Using this method, eight communications are necessary to reorder
// the data. However, the first communication is unnecessary, because it
// only needs to reorder data within the cluster (ie, it is moving data
// that needs to be stored in the same cluster again.) Note the zig-zag
// code reorders the data in such a way as to transpose the data also.
// Thus, after the zig-zag reordering, cluster 0 will have elements 0-7
// of each block, cluster 1 will have elements 8-15, etc. This is how
// the data is initially :
//
//                            Cluster
//               0    1    2    3    4    5    6    7
//
//          0    0    2    3    9   10   20   21   35
//          1    1    4    8   11   19   22   34   36
//          2    5    7   12   18   23   33   37   48
//  Index   3    6   13   17   24   32   38   47   49
//          4   14   16   25   31   39   46   50   57
//          5   15   26   30   40   45   51   56   58
//          6   27   29   41   44   52   55   59   62
//          7   28   42   43   53   54   60   61   63
//
//
// Thus, element 0 needs to move from location 0 in cluster 0 to location
// 0 in cluster 0 (again). Element 1 likewise stays where it is (location 1
// of cluster 0). Element 2 needs to move from location 0 of cluster 1 to
// location 2 of cluster 0. Element 3 needs to go from location 0 of
// cluster 2 to location 3 of cluster 0, and so on. Here are all the
// communications necessary :
//
// Communication 1 Communication 2 Communication 3 Communication 4
// 0[0] -> 0[0] 0[1] -> 0[1] 0[2] -> 0[5] 0[3] -> 0[6]
// 1[3] -> 1[5] 1[4] -> 2[0] 1[5] -> 3[2] 1[6] -> 3[5]
// 2[3] -> 2[1] 2[1] -> 1[0] 2[2] -> 1[4] 2[6] -> 5[1]
// 3[3] -> 3[0] 3[4] -> 3[7] 3[2] -> 2[2] 3[0] -> 1[1]
// 4[3] -> 4[0] 4[4] -> 4[7] 4[5] -> 5[5] 4[1] -> 2[3]
// 5[4] -> 5[6] 5[5] -> 6[3] 5[3] -> 4[6] 5[6] -> 6[7]
// 6[4] -> 6[2] 6[3] -> 5[7] 6[5] -> 7[0] 6[1] -> 4[2]
// 7[4] -> 7[1] 7[5] -> 7[2] 7[2] -> 6[0] 7[6] -> 7[6]
//
// Communication 5 Communication 6 Communication 7 Communication 8
// 0[4] -> 1[6] 0[5] -> 1[7] 0[6] -> 3[3] 0[7] -> 3[4]
// 1[0] -> 0[2] 1[1] -> 0[4] 1[2] -> 0[7] 1[7] -> 5[2]
// 2[4] -> 3[1] 2[5] -> 3[6] 2[7] -> 5[3] 2[0] -> 0[3]
// 3[5] -> 5[0] 3[6] -> 5[4] 3[1] -> 1[3] 3[7] -> 6[5]
// 4[2] -> 2[7] 4[6] -> 6[4] 4[7] -> 6[6] 4[0] -> 1[2]
// 5[7] -> 7[4] 5[2] -> 4[1] 5[0] -> 2[4] 5[1] -> 2[6]
// 6[2] -> 4[5] 6[0] -> 2[5] 6[6] -> 7[3] 6[7] -> 7[5]
// 7[3] -> 6[1] 7[7] -> 7[7] 7[0] -> 4[3] 7[1] -> 4[4]
//
// The resulting indices that are required for each permutation can be
// found in the constants file. The required permutations are :
// 48-word destination array for the run-level-encoded data.
array<int> rle_out(48); // the output array
// Index pair for the first reordering communication, taken from entry 0 of
// store_idx_arr / send_idx_arr (both defined outside this view).
int store_idx, send_idx;
store_idx = store_idx_arr[0];
send_idx = send_idx_arr[0];
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -