📄 mpeg_kc.i
字号:
half2 d16, d07, d25, d34, d1625, d0734;
d07 = a0 - a7;
d16 = a1 - a6;
d25 = a2 - a5;
d34 = a3 - a4;
d1625 = s16 - s25;
d0734 = s07 - s34;
half2 sd16d07, sd25d34;
sd16d07 = d07 + d16;
sd25d34 = d25 + d34;
half2 m1_over_2, m2, m5, m6, m7, m8, m9;
m1_over_2 = s0734 + s1625;
m2 = s0734 - s1625;
m5 = hi(COS_2 * shift(d1625 + d0734, two));
m6 = hi(COS_2 * shift(d25 + d16, two));
m7 = hi(COS_3 * shift(sd16d07 - sd25d34, two));
m8 = hi((COS_1_plus_COS_3) * shift(sd16d07, two));
m9 = hi((COS_1_minus_COS_3) * shift(sd25d34, two));
half2 s5, s6, s7, s8;
s5 = d07 + m6;
s6 = d07 - m6;
s7 = m8 - m7;
s8 = m9 - m7;
array<half2> buf1(8);
array<half2> buf2(8);
buf1[0] = hi(K[0] * shift(m1_over_2, two));
buf1[1] = hi(K[1] * shift(s5 + s7, two));
buf1[2] = hi(K[2] * shift(d0734 + m5, two));
buf1[3] = hi(K[3] * shift(s6 - s8, two));
buf1[4] = hi(K[4] * shift(m2, two));
buf1[5] = hi(K[5] * shift(s6 + s8, two));
buf1[6] = hi(K[6] * shift(d0734 - m5, two));
buf1[7] = hi(K[7] * shift(s5 - s7, two));
buf2[idx0] = buf1[idx0];
buf2[idx7] = commucperm(perm_a, buf1[idx1]);
buf2[idx6] = commucperm(perm_b, buf1[idx2]);
buf2[idx5] = commucperm(perm_c, buf1[idx3]);
buf2[idx4] = commucperm(perm_d, buf1[idx4]);
buf2[idx3] = commucperm(perm_e, buf1[idx5]);
buf2[idx2] = commucperm(perm_f, buf1[idx6]);
buf2[idx1] = commucperm(perm_g, buf1[idx7]);
a0 = buf2[0];
a1 = buf2[1];
a2 = buf2[2];
a3 = buf2[3];
a4 = buf2[4];
a5 = buf2[5];
a6 = buf2[6];
a7 = buf2[7];
stage(2);
s07 = a0 + a7;
s16 = a1 + a6;
s25 = a2 + a5;
s34 = a3 + a4;
s1625 = s16 + s25;
s0734 = s07 + s34;
d07 = a0 - a7;
d16 = a1 - a6;
d25 = a2 - a5;
d34 = a3 - a4;
d1625 = s16 - s25;
d0734 = s07 - s34;
sd16d07 = d07 + d16;
sd25d34 = d25 + d34;
m1_over_2 = s0734 + s1625;
m2 = s0734 - s1625;
m5 = hi(COS_2 * shift(d1625 + d0734, two));
m6 = hi(COS_2 * shift(d25 + d16, two));
m7 = hi(COS_3 * shift(sd16d07 - sd25d34, two));
m8 = hi((COS_1_plus_COS_3) * shift(sd16d07, two));
m9 = hi((COS_1_minus_COS_3) * shift(sd25d34, two));
s5 = d07 + m6;
s6 = d07 - m6;
s7 = m8 - m7;
s8 = m9 - m7;
d0 = m1_over_2;
d1 = s5 + s7;
d2 = d0734 + m5;
d3 = s6 - s8;
d4 = m2;
d5 = s6 + s8;
d6 = d0734 - m5;
d7 = s5 - s7;
uhalf2 round_cmp;
double<half2> dct_quant;
cc sign, add;
sign = itocc(int(d0 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[0+dest_idx] * shift(d0, two);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
CrCbY4Y3Y2Y1_b[0+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
sign = itocc(int(d1 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[1+dest_idx] * shift(d1, two);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
CrCbY4Y3Y2Y1_b[1+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
sign = itocc(int(d2 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[2+dest_idx] * shift(d2, two);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
CrCbY4Y3Y2Y1_b[2+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
sign = itocc(int(d3 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[3+dest_idx] * shift(d3, two);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
CrCbY4Y3Y2Y1_b[3+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
sign = itocc(int(d4 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[4+dest_idx] * shift(d4, two);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
CrCbY4Y3Y2Y1_b[4+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
sign = itocc(int(d5 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[5+dest_idx] * shift(d5, two);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
CrCbY4Y3Y2Y1_b[5+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
sign = itocc(int(d6 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[6+dest_idx] * shift(d6, two);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
CrCbY4Y3Y2Y1_b[6+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
sign = itocc(int(d7 <= 0));
round_cmp = select(sign, uh2_half, uh2_almost_half);
dct_quant = quant[7+dest_idx] * shift(d7, two);
add = itocc(int(round_cmp < uhalf2(lo(dct_quant))));
CrCbY4Y3Y2Y1_b[7+dest_idx] = select(add, (hi(dct_quant)+h2_one), hi(dct_quant));
dest_idx = dest_idx + eight;
}
// Emit the 24 entries CrCbY4Y3Y2Y1_b[0] .. CrCbY4Y3Y2Y1_b[23] to dct_out,
// one statement per entry, in ascending index order.
// NOTE(review): dct_out is declared outside this excerpt; presumably it is a
// kernel output stream and `<<` appends one record to it -- confirm.
dct_out << CrCbY4Y3Y2Y1_b[0];
dct_out << CrCbY4Y3Y2Y1_b[1];
dct_out << CrCbY4Y3Y2Y1_b[2];
dct_out << CrCbY4Y3Y2Y1_b[3];
dct_out << CrCbY4Y3Y2Y1_b[4];
dct_out << CrCbY4Y3Y2Y1_b[5];
dct_out << CrCbY4Y3Y2Y1_b[6];
dct_out << CrCbY4Y3Y2Y1_b[7];
dct_out << CrCbY4Y3Y2Y1_b[8];
dct_out << CrCbY4Y3Y2Y1_b[9];
dct_out << CrCbY4Y3Y2Y1_b[10];
dct_out << CrCbY4Y3Y2Y1_b[11];
dct_out << CrCbY4Y3Y2Y1_b[12];
dct_out << CrCbY4Y3Y2Y1_b[13];
dct_out << CrCbY4Y3Y2Y1_b[14];
dct_out << CrCbY4Y3Y2Y1_b[15];
dct_out << CrCbY4Y3Y2Y1_b[16];
dct_out << CrCbY4Y3Y2Y1_b[17];
dct_out << CrCbY4Y3Y2Y1_b[18];
dct_out << CrCbY4Y3Y2Y1_b[19];
dct_out << CrCbY4Y3Y2Y1_b[20];
dct_out << CrCbY4Y3Y2Y1_b[21];
dct_out << CrCbY4Y3Y2Y1_b[22];
dct_out << CrCbY4Y3Y2Y1_b[23];
// Copy the quantized coefficients from CrCbY4Y3Y2Y1_b into the 48-entry int
// scratch array rle_out, in eight steps (k = 0..7).
// Each step k:
//   - loads store_idx / send_idx from store_idx_arr[k] / send_idx_arr[k];
//   - reads three source words at send_idx + 0, + 8 and + 16;
//   - writes them, converted to int, to rle_out at store_idx + 0, + 16, + 32.
// Step 0 copies the words directly; steps 1..7 route each word through
// commucperm(perm_k, ...) first.
// NOTE(review): commucperm, perm_1..perm_7, store_idx_arr and send_idx_arr are
// defined outside this excerpt. Presumably commucperm is an inter-cluster
// permute and the index tables implement a coefficient reordering -- confirm.
// Only 24 of the 48 rle_out slots are written in this section.
array<int> rle_out(48);
int store_idx, send_idx;
// Step 0: no permutation.
store_idx = store_idx_arr[0];
send_idx = send_idx_arr[0];
rle_out[0+store_idx] = int(CrCbY4Y3Y2Y1_b[0+send_idx]);
rle_out[16+store_idx] = int(CrCbY4Y3Y2Y1_b[8+send_idx]);
rle_out[32+store_idx] = int(CrCbY4Y3Y2Y1_b[16+send_idx]);
// Step 1: via perm_1.
store_idx = store_idx_arr[1];
send_idx = send_idx_arr[1];
rle_out[0+store_idx] = int(commucperm(perm_1, CrCbY4Y3Y2Y1_b[0+send_idx]));
rle_out[16+store_idx] = int(commucperm(perm_1,CrCbY4Y3Y2Y1_b[8+send_idx]));
rle_out[32+store_idx] = int(commucperm(perm_1,CrCbY4Y3Y2Y1_b[16+send_idx]));
// Step 2: via perm_2.
store_idx = store_idx_arr[2];
send_idx = send_idx_arr[2];
rle_out[0+store_idx] = int(commucperm(perm_2, CrCbY4Y3Y2Y1_b[0+send_idx]));
rle_out[16+store_idx] = int(commucperm(perm_2,CrCbY4Y3Y2Y1_b[8+send_idx]));
rle_out[32+store_idx] = int(commucperm(perm_2,CrCbY4Y3Y2Y1_b[16+send_idx]));
// Step 3: via perm_3.
store_idx = store_idx_arr[3];
send_idx = send_idx_arr[3];
rle_out[0+store_idx] = int(commucperm(perm_3, CrCbY4Y3Y2Y1_b[0+send_idx]));
rle_out[16+store_idx] = int(commucperm(perm_3,CrCbY4Y3Y2Y1_b[8+send_idx]));
rle_out[32+store_idx] = int(commucperm(perm_3,CrCbY4Y3Y2Y1_b[16+send_idx]));
// Loop with a count of 0 whose body only declares a local.
// NOTE(review): it has no visible data effect; presumably it is a deliberate
// placeholder (e.g. to split the surrounding code for the scheduler) -- confirm
// before removing.
i = 0;
loop_count(i) {
int g = 0;
}
// Step 4: via perm_4.
store_idx = store_idx_arr[4];
send_idx = send_idx_arr[4];
rle_out[0+store_idx] = int(commucperm(perm_4, CrCbY4Y3Y2Y1_b[0+send_idx]));
rle_out[16+store_idx] = int(commucperm(perm_4,CrCbY4Y3Y2Y1_b[8+send_idx]));
rle_out[32+store_idx] = int(commucperm(perm_4,CrCbY4Y3Y2Y1_b[16+send_idx]));
// Step 5: via perm_5.
store_idx = store_idx_arr[5];
send_idx = send_idx_arr[5];
rle_out[0+store_idx] = int(commucperm(perm_5, CrCbY4Y3Y2Y1_b[0+send_idx]));
rle_out[16+store_idx] = int(commucperm(perm_5,CrCbY4Y3Y2Y1_b[8+send_idx]));
rle_out[32+store_idx] = int(commucperm(perm_5,CrCbY4Y3Y2Y1_b[16+send_idx]));
// Step 6: via perm_6.
store_idx = store_idx_arr[6];
send_idx = send_idx_arr[6];
rle_out[0+store_idx] = int(commucperm(perm_6, CrCbY4Y3Y2Y1_b[0+send_idx]));
rle_out[16+store_idx] = int(commucperm(perm_6,CrCbY4Y3Y2Y1_b[8+send_idx]));
rle_out[32+store_idx] = int(commucperm(perm_6,CrCbY4Y3Y2Y1_b[16+send_idx]));
// Step 7: via perm_7.
store_idx = store_idx_arr[7];
send_idx = send_idx_arr[7];
rle_out[0+store_idx] = int(commucperm(perm_7, CrCbY4Y3Y2Y1_b[0+send_idx]));
rle_out[16+store_idx] = int(commucperm(perm_7,CrCbY4Y3Y2Y1_b[8+send_idx]));
rle_out[32+store_idx] = int(commucperm(perm_7,CrCbY4Y3Y2Y1_b[16+send_idx]));
// Local run/level packing over rle_out.
// Each rle_out word is read as a half2 (two 16-bit halves). The low half is
// written back through lo_idx and the high half through hi_idx, which starts
// eight slots after lo_idx. Zero values are skipped and counted in `run`; each
// stored word combines a value (low 16 bits) with a run count (high 16 bits).
// NOTE(review): `k` is declared here but not used in the visible code.
uc<int> j, k;
// Condition that is true where cid() == 0.
// NOTE(review): cid() presumably returns the cluster id -- confirm.
cc clust_zero = itocc(cid() == 0);
cc ac_zero;
cc luminance;
cc all_non_zero;
int data_idx = 0;
int lo_idx;
int hi_idx;
int hi_mask, lo_mask;
int run_level;
int pred;
int pred_idx;
half2 run;
half2 ac_val;
int lo_ac_val;
int hi_ac_val;
int dc_val;
int run_idx = 0;
// Two final run counts (low half, high half) are stored per outer iteration.
array<int> runs(6);
// Build -1 and -sixteen by subtraction, then derive the half-word masks.
// NOTE(review): assumes shift() with a negative count is a logical right shift,
// so lo_mask selects the low 16 bits and hi_mask the high 16 bits -- confirm.
minus_one = 0 - 1;
minus_sixteen = 0 - sixteen;
lo_mask = shift(minus_one, minus_sixteen);
hi_mask = shift(minus_one, sixteen);
// Outer loop, count 3. data_idx advances by 1 + 7 + 8 per iteration, assuming
// loop_count(n) runs its body n times (TODO confirm), i.e. 0, 16, 32.
i = 3;
loop_count(i) {
lo_idx = data_idx;
hi_idx = data_idx + eight;
// --- First word of the group: predictor subtraction. ---
ac_val = half2(rle_out[data_idx]);
ac_zero = itocc(int(ac_val == 0));
lo_ac_val = int(ac_val) & lo_mask;
run_level = lo_ac_val;
// True while data_idx <= sixteen, i.e. for the first two groups if data_idx
// steps 0, 16, 32 as described above.
luminance = itocc(data_idx <= sixteen);
// Low half: predictor slot 0 when `luminance`, otherwise slot 1.
pred_idx = select(luminance, 0, 1);
pred = dc_pred[pred_idx];
dc_val = lo_ac_val - pred;
// Only where clust_zero holds: the predictor is replaced by the new value and
// the stored word becomes the difference. Elsewhere both are left as they were.
dc_pred[pred_idx] = select(clust_zero, lo_ac_val, pred);
run_level = select(clust_zero, dc_val, run_level);
rle_out[lo_idx] = run_level;
// High half: same treatment, predictor slot 0 when `luminance`, otherwise slot two.
hi_ac_val = shift(int(ac_val), minus_sixteen);
run_level = hi_ac_val;
pred_idx = select(luminance, 0, two);
pred = dc_pred[pred_idx];
dc_val = hi_ac_val - pred;
dc_pred[pred_idx] = select(clust_zero, hi_ac_val, pred);
run_level = select(clust_zero, dc_val, run_level);
rle_out[hi_idx] = run_level;
data_idx = data_idx + 1;
// Advance the write positions: always by one where clust_zero holds, otherwise
// only when the value just processed was non-zero.
// NOTE(review): ac_zero comes from a half2 comparison; the code treats it as
// carrying one flag per half (low half used directly, high half reached by
// shifting right by sixteen) -- confirm against the cc/half2 semantics.
lo_idx = lo_idx + select(clust_zero, 1, select(ac_zero, 0, 1));
hi_idx = hi_idx + select(clust_zero, 1, shift(int(select(ac_zero, 0, h2_one)), minus_sixteen));
// Initial run: 0 where clust_zero holds; elsewhere h2_one where the first
// value was zero, else 0.
run = half2(select(clust_zero, 0, int(select(ac_zero, h2_one, 0))));
// --- Remaining seven words of the group. ---
j = 7;
loop_count(j) {
ac_val = half2(rle_out[data_idx]);
ac_zero = itocc(int(ac_val == 0));
// Low half: value in bits 0..15, low-half run count moved into bits 16..31.
run_level = (int(ac_val) & lo_mask) | shift(int(run), sixteen);
rle_out[lo_idx] = run_level;
// High half: value shifted down into bits 0..15, high-half run count kept in bits 16..31.
run_level = shift(int(ac_val),minus_sixteen) | (int(run) & hi_mask);
rle_out[hi_idx] = run_level;
// Zero value: extend the run. Non-zero value: reset the run.
run = select(ac_zero, (run + h2_one), 0);
data_idx = data_idx + 1;
// Keep the write position on a zero (the slot is overwritten next time),
// advance it on a non-zero.
lo_idx = select(ac_zero, lo_idx, lo_idx + 1);
// Re-derive the flag for the high half before updating hi_idx.
ac_zero = itocc(shift(cctoi(ac_zero), minus_sixteen));
hi_idx = select(ac_zero, hi_idx, hi_idx + 1);
}
// Save the trailing run of each half for later use.
runs[0+run_idx] = int(run) & lo_mask;
runs[1+run_idx] = shift(int(run), minus_sixteen);
run_idx = run_idx + two;
// If the low-half write position did not reach the end of its eight slots,
// store 0 at that position; otherwise leave the slot unchanged.
all_non_zero = itocc(lo_idx == data_idx);
rle_out[lo_idx] = select( all_non_zero, int(rle_out[lo_idx]), 0);
// Skip the eight high-half slots, then apply the same rule to hi_idx.
data_idx = data_idx + eight;
all_non_zero = itocc(hi_idx == data_idx);
rle_out[hi_idx] = select(all_non_zero, int(rle_out[hi_idx]), 0);
}
// Propagate trailing run counts between clusters.
// A run_token is passed along with commucperm(perm_token, ...) and added, shifted
// into the upper 16 bits, to the first stored word of the current 8-slot group.
// NOTE(review): perm_token = 0x65432100 presumably makes each cluster read from
// the next lower-numbered cluster, with cluster 0 reading itself -- confirm
// against the commucperm encoding.
cc cluster_one;
cc invalid;
cc send_run;
cc all_zeros;
int true1;
int done;
int blocks_left;
int run_token;
int orig_run_level;
int new_run_level;
int block_idx;
uc<int> perm_token = 0x65432100;
cluster_one = itocc(cid() == 1);
// All-ones value used as "true" when seeding the `invalid` condition.
true1 = minus_one;
// eight - 2 = 6; counted down once per loop iteration below.
blocks_left = eight - (1 + 1);
// Initially every cluster except the one with cid() == 1 is `invalid`.
invalid = itocc(select(cluster_one, 0, true1));
// Initially only the cid() == 0 cluster is `send_run`.
send_run = clust_zero;
run_idx = 0;
block_idx = 0;
run_token = runs[0];
i = 12;
loop_count(i) {
orig_run_level = rle_out[block_idx];
// The group's first stored word is 0.
all_zeros = itocc(orig_run_level == 0);
// Take the token supplied by the source cluster of perm_token.
run_token = commucperm(perm_token, run_token);
new_run_level = orig_run_level + shift(run_token, sixteen);
// While `invalid`, the word is rewritten unchanged. Otherwise a zero word
// stays 0 and a non-zero word receives the incoming run in its upper half.
rle_out[block_idx] = select(invalid, orig_run_level, select(all_zeros, 0, new_run_level));
blocks_left = blocks_left - 1;
done = blocks_left == 0;
run_idx = select(send_run, run_idx + 1, run_idx);
// Move on to the next 8-slot group only when this cluster is not `invalid`.
block_idx = select(invalid, block_idx, block_idx + eight);
// Zero word: forward the incoming token plus eight.
// Non-zero word: forward this cluster's own run, runs[run_idx].
run_token = select(all_zeros, run_token + eight, runs[run_idx]);
// The cid() == 1 cluster takes `invalid` from `done`; every other cluster
// takes it from its perm_token source.
invalid = itocc(select(cluster_one, done, commucperm(perm_token, cctoi(invalid))));
// The cid() == 0 cluster takes `send_run` from ~done; every other cluster
// takes it from its perm_token source.
send_run = itocc(select(clust_zero, ~done, commucperm(perm_token, cctoi(send_run))));
}
// Turn-based emission of the packed words to `out`.
// For each of six 8-slot groups, the cid() == 0 cluster starts with the turn.
// The turn is handed on via commucperm(perm_token, next_turn) when the current
// holder reads a 0 word or reaches the end of its group.
// NOTE(review): `out` is declared outside this excerpt; presumably
// `out(cond) << v` is a conditional stream write performed only where cond
// holds -- confirm.
// NOTE(review): out_idx_tmp and my_turn_save are declared but not used in the
// visible code.
int rl;
int out_idx, out_idx_tmp;
int my_turn_save;
int my_turn;
int cluster_zero;
int next_turn;
int next_turn_inv;
cc send;
cc my_turn_cc;
block_idx = 0;
i = 6;
loop_count(i) {
cluster_zero = (cid() == 0);
my_turn = cluster_zero;
my_turn_cc = itocc(my_turn);
// The turn holder emits the first word of the group unconditionally.
rl = rle_out[block_idx];
out(my_turn_cc) << rl;
// The turn holder has consumed one word; other clusters stay at the group start.
out_idx = select(my_turn_cc, block_idx + 1, block_idx);
// block_idx now marks the end of this group (and the start of the next one).
block_idx = block_idx + eight;
next_turn = 0;
// Seeded with all ones so the loop body is entered.
cc loopcc = itocc(minus_one);
loop_while_any(loopcc) {
stage(1);
rl = rle_out[out_idx];
// The holder gives up the turn when the word is 0, when out_idx has reached
// the end of the group, or when next_turn was already set.
next_turn = my_turn & ((rl == 0) | ((block_idx == out_idx) | next_turn));
my_turn_cc = itocc(my_turn);
out_idx = select(my_turn_cc, (out_idx + 1), out_idx);
stage(2);
next_turn_inv = ~next_turn;
// Emit only when holding the turn and not giving it up on this word.
send = itocc(my_turn & next_turn_inv);
// The cid() == 0 cluster always keeps my_turn set; the others receive it from
// their perm_token source.
my_turn = cluster_zero | commucperm(perm_token, next_turn);
// NOTE(review): the loop presumably continues while loopcc holds on any
// cluster (per the name loop_while_any) -- confirm.
loopcc = itocc(next_turn_inv);
stage(3);
out(send) << rl;
}
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -