📄 mpeg_kc.cpp

📁 H.264完整的C语言代码和DCT的代码
💻 CPP
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
  rle_out[0+store_idx] = int(CrCbY4Y3Y2Y1_b[0+send_idx]);
  rle_out[16+store_idx] = int(CrCbY4Y3Y2Y1_b[8+send_idx]);
  rle_out[32+store_idx] = int(CrCbY4Y3Y2Y1_b[16+send_idx]);

  store_idx = store_idx_arr[1];
  send_idx = send_idx_arr[1];
  rle_out[0+store_idx] =  int(commucperm(perm_1, CrCbY4Y3Y2Y1_b[0+send_idx]));
  rle_out[16+store_idx] = int(commucperm(perm_1,CrCbY4Y3Y2Y1_b[8+send_idx]));
  rle_out[32+store_idx] = int(commucperm(perm_1,CrCbY4Y3Y2Y1_b[16+send_idx]));

  store_idx = store_idx_arr[2];
  send_idx = send_idx_arr[2];
  rle_out[0+store_idx] =  int(commucperm(perm_2, CrCbY4Y3Y2Y1_b[0+send_idx]));
  rle_out[16+store_idx] = int(commucperm(perm_2,CrCbY4Y3Y2Y1_b[8+send_idx]));
  rle_out[32+store_idx] = int(commucperm(perm_2,CrCbY4Y3Y2Y1_b[16+send_idx]));

  store_idx = store_idx_arr[3];
  send_idx = send_idx_arr[3];
  rle_out[0+store_idx] =  int(commucperm(perm_3, CrCbY4Y3Y2Y1_b[0+send_idx]));
  rle_out[16+store_idx] = int(commucperm(perm_3,CrCbY4Y3Y2Y1_b[8+send_idx]));
  rle_out[32+store_idx] = int(commucperm(perm_3,CrCbY4Y3Y2Y1_b[16+send_idx]));



  // temporary, to get around rf allocation problems
  i = 0;
  loop_count(i) {
    int g = 0;
  }


  store_idx = store_idx_arr[4];
  send_idx = send_idx_arr[4];
  rle_out[0+store_idx] =  int(commucperm(perm_4, CrCbY4Y3Y2Y1_b[0+send_idx]));
  rle_out[16+store_idx] = int(commucperm(perm_4,CrCbY4Y3Y2Y1_b[8+send_idx]));
  rle_out[32+store_idx] = int(commucperm(perm_4,CrCbY4Y3Y2Y1_b[16+send_idx]));

  store_idx = store_idx_arr[5];
  send_idx = send_idx_arr[5];
  rle_out[0+store_idx] =  int(commucperm(perm_5, CrCbY4Y3Y2Y1_b[0+send_idx]));
  rle_out[16+store_idx] = int(commucperm(perm_5,CrCbY4Y3Y2Y1_b[8+send_idx]));
  rle_out[32+store_idx] = int(commucperm(perm_5,CrCbY4Y3Y2Y1_b[16+send_idx]));

  store_idx = store_idx_arr[6];
  send_idx = send_idx_arr[6];
  rle_out[0+store_idx] =  int(commucperm(perm_6, CrCbY4Y3Y2Y1_b[0+send_idx]));
  rle_out[16+store_idx] = int(commucperm(perm_6,CrCbY4Y3Y2Y1_b[8+send_idx]));
  rle_out[32+store_idx] = int(commucperm(perm_6,CrCbY4Y3Y2Y1_b[16+send_idx]));

  store_idx = store_idx_arr[7];
  send_idx = send_idx_arr[7];
  rle_out[0+store_idx] =  int(commucperm(perm_7, CrCbY4Y3Y2Y1_b[0+send_idx]));
  rle_out[16+store_idx] = int(commucperm(perm_7,CrCbY4Y3Y2Y1_b[8+send_idx]));
  rle_out[32+store_idx] = int(commucperm(perm_7,CrCbY4Y3Y2Y1_b[16+send_idx]));


  // First, each cluster independently does the run-level encoding for
  // the eight values of each block that that cluster has. Later, the
  // "mini-blocks" of eight will be patched up. To do the run level encoding,
  // a variable called run is used to keep track of the current run. The
  // final value of this variable needs to be saved and will be used to
  // "patch up" the encoding for the whole block. Also, only the run-level
  // pairs are recorded. All zero elements are removed. Thus, not all eight
  // locations within a block hold useful data at the end of this operation.
  // A zero is used to mark the first non-useful data (if there are fewer
  // than eight actual run-level pairs). Storing only the actual run-level
  // pairs speeds up the condition output stream loop later because it
  // doesn't necessarily have to check all eight values, and thus the loop
  // can break early if possible. Cluster zero needs to deal with
  // the dc value, which is the first element in its array. The dc value
  // is stored as a difference between the current value and the prediction,
  // which is the previous dc value of a similar block. Thus there are three
  // different dc predictions, Y, Cb, and Cr. After using the prediction,
  // the current value is saved and becomes the prediction for next time.
  // The first four blocks use the Y prediction, and the next two use the
  // Cb and Cr predictions respectively. Finally, the data is initially
  // stored in half2 format with each 16 bit half containing an element
  // from a different block. Thus, each loop iteration, each cluster run
  // level encodes eight values from two different blocks at the same time.
  // The first block's results overwrite the original 8 elements, and the
  // second block's results will occupy the eight locations that start
  // right after the source elements. Thus, for example, Y4Y3 originally
  // is in locations 16-23, and the results for Y3 will be in 16-23, and
  // those for Y4 will be in 24-31. (Actually, there may be fewer than eight
  // results for each block, but the layout remains the same...)

  uc<int> j, k;    // loop vars
  cc clust_zero = itocc(cid() == 0);     // cluster 0 ?
  cc ac_zero;        // original half2 elements == 0 ??
  cc luminance;      // are we dealing with Y blocks? True for 1st 2 iter.
  cc all_non_zero;   // Were all 8 values in this cluster != 0 for this block
  int data_idx = 0;  // source data index
  int lo_idx;        // result data index for first block
  int hi_idx;        // result data index for second block
  int hi_mask, lo_mask;  // 16 bit masks
  int run_level;     // high 16 bits = run, low 16 bits = level
  int pred;          // dc prediction for this block type
  int pred_idx;      // 0 for Y4,Y3,Y2,Y1, 1 for Cb, 2 for Cr
  half2 run;         // current run of zeros
  half2 ac_val;      // original levels (zero and non zero)
  int lo_ac_val;     // ac value for first block
  int hi_ac_val;     // ac value for second block
  int dc_val;        // current dc value - predicted dc value
  int run_idx = 0;   // 0-5 for Y1,Y2,Y3,Y4,Cb,CR respectively
  array<int> runs(6); // final vals of the run variable; used for "patching up"

  minus_one = 0 - 1;
  minus_sixteen = 0 - sixteen;
  lo_mask = shift(minus_one, minus_sixteen);
  hi_mask = shift(minus_one, sixteen);

  i = 3;
  loop_count(i) {
    lo_idx = data_idx;
    hi_idx = data_idx + eight;

    // get values and check if they == 0
    ac_val = half2(rle_out[data_idx]);
    ac_zero = itocc(int(ac_val == 0));

    // first run is 0, so only set level - will be fixed by "patch up" later
    lo_ac_val = int(ac_val) & lo_mask;
    run_level = lo_ac_val;

    // in case this is a dc value, do dc prediction
    luminance = itocc(data_idx <= sixteen);
    pred_idx = select(luminance, 0, 1);
    pred = dc_pred[pred_idx];
    dc_val = lo_ac_val - pred;
    dc_pred[pred_idx] = select(clust_zero, lo_ac_val, pred);

    // save dc prediction if cluster 0, else store the ac level
    run_level = select(clust_zero, dc_val, run_level);
    rle_out[lo_idx] = run_level;

    // do the same for second block
    hi_ac_val = shift(int(ac_val), minus_sixteen);
    run_level = hi_ac_val;

    // second block dc value prediction
    pred_idx = select(luminance, 0, two);
    pred = dc_pred[pred_idx];
    dc_val = hi_ac_val - pred;
    dc_pred[pred_idx] = select(clust_zero, hi_ac_val, pred);

    // store second block dc value if cluster 0, else ac level
    run_level = select(clust_zero, dc_val, run_level);
    rle_out[hi_idx] = run_level;

    // update source data index
    data_idx = data_idx + 1;

    // result pointers update unconditionally if it was a dc value; else
    // if it was an ac value, they only increment if the level != 0
    lo_idx = lo_idx + select(clust_zero, 1, select(ac_zero, 0, 1));
    hi_idx = hi_idx + select(clust_zero, 1, shift(int(select(ac_zero, 0, h2_one)), minus_sixteen));

    // update the current run of zeros, and reset to 0 if level was non_zero
    run = half2(select(clust_zero, 0, int(select(ac_zero, h2_one, 0))));

    // all values are now ac values, so don't worry about dc stuff
    j = 7;
    loop_count(j) {
      // get values and check if they == 0
      ac_val = half2(rle_out[data_idx]);
      ac_zero = itocc(int(ac_val == 0));

      // encode run-level pair for the two blocks and store them
      run_level = (int(ac_val) & lo_mask) | shift(int(run), sixteen);
      rle_out[lo_idx] = run_level;
      run_level = shift(int(ac_val),minus_sixteen) | (int(run) & hi_mask);
      rle_out[hi_idx] = run_level;

      // update the run variable, and reset to 0 if level != 0
      run = select(ac_zero, (run + h2_one), 0);

      // update source data index
      data_idx = data_idx + 1;

      // update the result data indices if level != 0
      lo_idx = select(ac_zero, lo_idx, lo_idx + 1);
      ac_zero = itocc(shift(cctoi(ac_zero), minus_sixteen));
      hi_idx = select(ac_zero, hi_idx, hi_idx + 1);
    }

    // save final value of the run variable for each block
    runs[0+run_idx] = int(run) & lo_mask;
    runs[1+run_idx] = shift(int(run), minus_sixteen);
    run_idx = run_idx + two;

    // if not all the values were non-zero, store a zero to indicate the end
    // of actual run-level pairs for the first block.
    all_non_zero = itocc(lo_idx == data_idx);
    rle_out[lo_idx] = select( all_non_zero, int(rle_out[lo_idx]), 0);

    // update the source data index
    data_idx = data_idx + eight;

    // if not all the values were non-zero, store a zero to indicate the end
    // of actual run-level pairs for the second block.
    all_non_zero = itocc(hi_idx == data_idx);
    rle_out[hi_idx] = select(all_non_zero, int(rle_out[hi_idx]), 0);
  }



  // Now, patch up the run-level encoding. This is done by patching up the
  // cluster 0/cluster 1 data first, and then the cluster 1/cluster 2
  // data, etc. However it is pipelined so that the patching for all 6
  // blocks are done with as much parallelism as possible. Thus, while
  // the cluster 1/cluster 2 data is being patched up for the first block,
  // the cluster 0/cluster 1 data will be patched up for the second block,
  // and so on. The invalid cc tells the cluster when it has a valid run
  // from the previous cluster. Thus cluster 7 will be invalid the first
  // 6 iterations, and cluster 1 will be invalid the last 6. Cluster 0 is
  // always invalid because it doesn't need to have any of its data patched
  // up (ie, there is no previous cluster from which to get a run). The
  // invalid cc is passed from cluster to cluster via communication. It
  // originates in cluster 1, and cluster 1 sets it to false until it has
  // finished all its blocks. It starts as true in every other cluster.
  // Eventually the invalid flag will be false in every cluster, and then
  // true again as the value cluster 1 sets it to percolates to the rest
  // of the clusters.

  cc cluster_one;     // cluster 1 ??
  cc invalid;         // false when a cluster has a valid run to work with
  cc send_run;        // true if cluster needs to send its run to next cluster
  cc all_zeros;       // true if the cluster had all zeros for this block
  int true1;           // = 0xFFFF; used for setting cc's to TRUE
  int done;           // non-zero when cluster 1 has patched up all the blocks
  int blocks_left;    // how many blocks cluster 1 has left to patch up
  int run_token;      // run of zeros, passed from cluster to cluster
  int orig_run_level; // the run-level calculated above
  int new_run_level;  // the run-level adjusted with the final run from the
                      // previous cluster
  int block_idx;      // index to the blocks, 0,8,16,24,32,40
  uc<int> perm_token = 0x65432100;   // used to comm. invalid and run_token
                                     // sends value to next cluster

  cluster_one = itocc(cid() == 1);
  true1 = minus_one;
  blocks_left = eight - (1 + 1);
  invalid = itocc(select(cluster_one, 0, true1));
  send_run = clust_zero;
  run_idx = 0;         // index into final run values, 1 per block
  block_idx = 0;
  run_token = runs[0];

  i = 12;           // 7 patches per block; 6 blocks -> 7 + (6-1) = 12
  loop_count(i) {
    // get the run-level pair calculated above and see if it equals zero
    orig_run_level = rle_out[block_idx];
    all_zeros = itocc(orig_run_level == 0);

    // get the final run of zeros of the previous cluster
    run_token = commucperm(perm_token, run_token);

    // adjust first run-level pair of this cluster
    new_run_level = orig_run_level + shift(run_token, sixteen);

    // and store it if the previous cluster's run was valid; if it wasn't,
    // store the original value back again
    rle_out[block_idx] = select(invalid, orig_run_level, select(all_zeros, 0, new_run_level));

    // any blocks left?
    blocks_left = blocks_left - 1;
    done = blocks_left == 0;

    // update the idx to the run values and to the block data
    run_idx = select(send_run, run_idx + 1, run_idx);
    block_idx = select(invalid, block_idx, block_idx + eight);

    // if this cluster was all zeros, update run and pass it along, else
    // pass the final run of zeros for this cluster to the next
    run_token = select(all_zeros, run_token + eight, runs[run_idx]);

    // pass the invalid "token" on to the next cluster, and if this is
    // cluster 1, mark it true if no more blocks left
    invalid = itocc(select(cluster_one, done, commucperm(perm_token, cctoi(invalid))));

    // pass the send_run token, except mark it false if cluster 0 and done
    send_run = itocc(select(clust_zero, ~done, commucperm(perm_token, cctoi(send_run))));
  }

  // Output the results using a conditional output stream. This is an
  // inherently serial operation. The outermost loop goes through the
  // six blocks of each macroblock. This loop first makes cluster 0
  // output the dc value. Then the remaining (up to 63) values are output.
  // A while_any loop is used. Each cluster outputs all the data it has
  // and then passes the "token" (my_turn) on to the next cluster. Thus
  // there is only one loop iteration per run-level pair in the block,
  // and this may be less than 64 iterations. The loop is software pipelined
  // such that all the bookkeeping is done on the first "iteration." Every
  // cycle this "iteration" of the loop keeps track of which cluster should
  // be outputting, reads the data to output by a later "iteration" of the
  // software pipelined loop, and updates the scratchpad index. The second
  // "iteration" of the loop calculates the conditional output stream
  // condition. The final "iteration" sets the loop condition and does the
  // conditional stream output operations. Since the loop condition is set
  // only in the final "iteration" of the software pipeline, and the only
  // changing of any state also occurs in the last "iteration", software
  // pipelining the loop will work without adjusting how the loop condition
  // is calculated. Thus, there will have to be 2 extra real iterations of
  // the loop for every block.

  int rl;           // the run_level value; if == 0 don't send it
  int out_idx, out_idx_tmp;  // index into the data
  int my_turn_save; // save the value of my_turn before it is updated (scratch)
  int my_turn;      // -1 if it IS OR HAS BEEN my turn for this block
  int cluster_zero; // -1 for cluster 0, zero for the rest
  int next_turn;    // -1 if out of data, next cluster's turn for this block
  int next_turn_inv;  // -1 if out of data, next cluster's turn for this block
  cc send;          // I have valid data to output (it IS my turn to output)
  cc my_turn_cc;    // holds my_turn for SELECTs and costreams

  block_idx = 0;
  i = 6;
  loop_count(i) {
    cluster_zero = (cid() == 0);
    my_turn = cluster_zero;
    my_turn_cc = itocc(my_turn);

    // must send dc coefficient
    rl = rle_out[block_idx];
    out(my_turn_cc) << rl;
    out_idx = select(my_turn_cc, block_idx + 1, block_idx);
    block_idx = block_idx + eight;
    next_turn = 0;

    cc loopcc = itocc(minus_one);

    loop_while_any(loopcc) { // pipeline(1) {

    stage(1);

      // get next value
      rl = rle_out[out_idx];

      // is this cluster done yet?
      next_turn = my_turn & ((rl == 0) | ((block_idx == out_idx) | next_turn));

      // update index if it is your turn to output
      my_turn_cc = itocc(my_turn);
      out_idx = select(my_turn_cc, (out_idx + 1), out_idx);

    stage(2);

      // send if it is my_turn but not the next cluster's turn yet
      next_turn_inv = ~next_turn;
      send = itocc(my_turn & next_turn_inv);

      // communicate to next clusters if it is their turn to output now
      my_turn = cluster_zero | commucperm(perm_token, next_turn);

      // loop until all clusters have signalled that they are done
      loopcc = itocc(next_turn_inv);

    stage(3);

      out(send) << rl;
    }
    //  out(cluster_one) << minus_one;   // End of Block ????
  }
  // flush(out, minus_one + 1);      // NULL value ????
}
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -