📄 motion_est_core_2_optimized.c
字号:
pCodec->ME_command_queue0[23] = (3<<29) | (d_type << 24) | (Diamond_search_limit << 16) | ThEES; // dsize is always small
pCodec->ME_command_queue0[28] = (2<<29) | ((pMB->mv16x_1&0x7f)<<19) | ((pMB->mv16y_1&0x7f)<<12) | (pCodec->Raddr + (pMB->mv16y_1 < 0 ? 0 : (pMB->mv16y_1 >> 1)*16));
pCodec->ME_command_queue0[31] = (1<<28) | (2<<29) | ((prevMB->mv16x_0 & 0x7f) << 19) | ((prevMB->mv16y_0 & 0x7f) << 12)
| pCodec->Raddr + (prevMB->mv16y_0 < 0 ? 0 : (prevMB->mv16y_0 >> 1)*16);
pCodec->ME_command_queue0[40] = (1<<28) | (2<<29) | ((prevMB->mv16x_1 & 0x7f) << 19) | ((prevMB->mv16y_1 & 0x7f) << 12)
| pCodec->Raddr + (prevMB->mv16y_1 < 0 ? 0 : (prevMB->mv16y_1 >> 1)*16);
pCodec->ME_command_queue0[46] = (2<<29) | ((pMB->mv16x_3&0x7f)<<19) | ((pMB->mv16y_3&0x7f)<<12) | (pCodec->Raddr23 + (pMB->mv16y_3 < -16 ? ((-16>>1) * 16) : (pMB->mv16y_3 >> 1)*16));
pCodec->ME_command_queue0[49] = (1<<28) | (2<<29) | ((prevMB->mv16x_2 & 0x7f) << 19) | ((prevMB->mv16y_2 & 0x7f) << 12)
| pCodec->Raddr23 + (prevMB->mv16y_2 < -16 ? ((-16>>1) * 16) : (prevMB->mv16y_2 >> 1)*16);
pCodec->ME_command_queue0[58] = (1<<28) | (2<<29) | ((prevMB->mv16x_3 & 0x7f) << 19) | ((prevMB->mv16y_3 & 0x7f) << 12)
| pCodec->Raddr23 + (prevMB->mv16y_3 < -16 ? ((-16>>1) * 16) : (prevMB->mv16y_3 >> 1)*16);
// what's the bit 11 is doing ??? there's no bit 11 for MECTL control register
// after asking James and Albert, there is no bit 11, we can remove it after the code is stable
X = current1->rounding_type << 2 | (1 << 11) | 1; // 1<< 4 => fcode = 1
pCodec->ME_COMMAND = X;
/* end setup for next ME */
return 0;
}
int32_t
MotionEstimation_4MV(MACROBLOCK *const pMB,
MACROBLOCK *const pMB_mc,
uint32_t x, uint32_t y,
MBParam * const pParam,
Encoder *pEnc,
int32_t counter)
{
FTMCP100_CODEC *pCodec=(FTMCP100_CODEC *)pEnc->pCodec;
uint32_t iWcount;
MACROBLOCK *prevMB, *pMB_tmp;
VECTOR pmv[4];
int32_t index;
int32_t X, d_type;
int32_t MOTION_ACTIVITY;
int32_t tmp1, tmp2, tmp3, tmp4;
volatile MDMA *pmdma = MDMA1;
DECLARE_MP4_PTR
FRAMEINFO * const current1=pEnc->current1;
FRAMEINFO * const reference=pEnc->reference;
unsigned int XDIM=pEnc->mEncParam.u32FrameWidth;
// to get the 'current' and 'not current' half of DMA double command buffer
unsigned int *pDMA_cur,*pDMA_next;
pDMA_cur=pCodec->DMA_COMMAND_local+(pCodec->even_odd_1^1)*DMA_COMMAND_QUEUE_STRIDE;
pDMA_next=pCodec->DMA_COMMAND_local+(pCodec->even_odd_1)*DMA_COMMAND_QUEUE_STRIDE;
// since we have synchronized the DMA commands entries 28th~39th (or 76th~87th)
// to MC done , so checking DMA done is equivalent to checking MC done
POLL_MARKER_S
while((pmdma->Status & 0x1) == 0) {}
POLL_MARKER_E
SET_HOFFSET(((counter-1)&0x3)*16)
SET_MECADDR(CUR_Y0+pCodec->even_odd_1*384)
SET_MEIADDR((pCodec->triple_buffer_selector)*384 + INTER_Y0)
// for optimization, we invoke ME go here instead after MC go
SET_MECTL(pCodec->ME_COMMAND) // ME GO ************************
iWcount = pParam->mb_width;
// MB_mode = 1 means inter mode, 0 means intra mode
if (pCodec->MB_mode) {
index = (MBTransQuantInter(pParam, current1,pCodec)) | (pCodec->MVZ << 16) | (pMB_mc->mode<<6);
} else {
if (pParam->resyn==0)
{
pMB_tmp = pMB_mc;
pCodec->acdc_status = 0;
if ((y==0) || ((y==1) && (x==0))) {
pCodec->acdc_status = 6;
pMB_tmp = pMB_tmp - 1;
if (pMB_tmp->mode != MODE_INTRA) // check Left
pCodec->acdc_status |= 1;
} else if (x==1) {
pCodec->acdc_status = 5;
pMB_tmp = pMB_tmp - (XDIM/16); // check Top
if (pMB_tmp->mode != MODE_INTRA)
pCodec->acdc_status |= 2;
} else {
pMB_tmp = pMB_tmp - 1;
if (pMB_tmp->mode != MODE_INTRA) // check Left
pCodec->acdc_status |= 1;
pMB_tmp = pMB_tmp - (XDIM/16); // check Top-Left
if (pMB_tmp->mode != MODE_INTRA)
pCodec->acdc_status |= 4;
pMB_tmp += 1;
if (pMB_tmp->mode != MODE_INTRA) // check Top
pCodec->acdc_status |= 2;
}
}
else
{
pCodec->acdc_status = 6;
pMB_tmp = pMB_mc - 1;
if ((pMB_tmp->mode != MODE_INTRA) | (x==1))
pCodec->acdc_status |= 1;
}
index = MBTransQuantIntra_p(pParam, current1,pCodec);
}
SET_MCCTL(index) // MC GO
// begin to execute the DMA commands prepared in advance (from 0th to 0+44(0x2C)-1=43th)
DMA_MOVE((pCodec->even_odd_1^1)*DMA_COMMAND_QUEUE_STRIDE, 0x4B0002C)
// to prepare the 'not current' DMA buffer command
pDMA_next[1] = TRANSLATE_LOCAL_MEMORY_BASE_ADDRESS((uint32_t) REF_Y + ((counter+2)&0x3)*16);
pDMA_next[5] = TRANSLATE_LOCAL_MEMORY_BASE_ADDRESS((uint32_t) REF_U + ((counter+2)&0x3)*8);
pDMA_next[9] = TRANSLATE_LOCAL_MEMORY_BASE_ADDRESS((uint32_t) REF_V + ((counter+2)&0x3)*8);
pDMA_next[37] = TRANSLATE_LOCAL_MEMORY_BASE_ADDRESS(LOCAL_PREDICTOR0 + (pCodec->even_odd_I)*0x80);
pDMA_next[40] = pDMA_cur[36];
if(x==pEnc->mbParam.mb_width-1) { pDMA_next[36] = (uint32_t) (pCodec->pred_value_phy); } else {pDMA_next[36] = (uint32_t) (pCodec->pred_value_phy + (x+1)*32); }
// make it a chain
pDMA_next[39] = 0x04A42010; // make it group 2, to sync to MC done
{
unsigned int offset=((pCodec->triple_buffer_selector+2)%3)*384;
//pDMA_next[25] = TRANSLATE_LOCAL_MEMORY_BASE_ADDRESS(((pCodec->triple_buffer_selector+2)%3)*384 + INTER_Y0);
//pDMA_next[29] = TRANSLATE_LOCAL_MEMORY_BASE_ADDRESS(((pCodec->triple_buffer_selector+2)%3)*384 + INTER_U0);
//pDMA_next[33] = TRANSLATE_LOCAL_MEMORY_BASE_ADDRESS(((pCodec->triple_buffer_selector+2)%3)*384 + INTER_V0);
pDMA_next[25] = TRANSLATE_LOCAL_MEMORY_BASE_ADDRESS(offset + INTER_Y0);
pDMA_next[29] = TRANSLATE_LOCAL_MEMORY_BASE_ADDRESS(offset + INTER_U0);
pDMA_next[33] = TRANSLATE_LOCAL_MEMORY_BASE_ADDRESS(offset + INTER_V0);
// well, originally, we want to make the DMA transfer of reconstructed YUV image to be moved to
// system memory with no synchronization. But we found that one problem occurs due to the fact that
// ME engine will copy the current MB to the another bank 0 current MB buffer once MC is done , but
// sometimes when quantizer is 1 and MC period is long and CPU is running fast enough, the program
// will invoke the ME go or MC go before the operation of copying current MB to another bank 0 current
// MB buffer is done. This will cause problem and make some patterns fail in FPGA platform.
// There's no way to synchronize this copying operation at this point. Unless we add delay mechanism
// after MC go.
// Anyway, in order to avoid such condition, so we make reconstructed V component to sync to MC done
// in order to let the copying operation has enough time to complete before ME go or MC go.
// make it group 0, to normal execution
//pDMA_next[27] = 0x04B00040;
//pDMA_next[31] = 0x04B00010;
//pDMA_next[35] = 0x04B00010;
// make it group 2, make the V component sync to MC done
pDMA_next[27] = 0x04B00040;
pDMA_next[31] = 0x04B02010; // to make U component sync to MC done in order to let hardware has enough time to move the current MB to bank 0
pDMA_next[35] = 0x04B02010;// to make V component sync to MC done in order to let hardware has enough time to move the current MB to bank 0
}
pCodec->even_odd_I ^= 1;
X = current1->rounding_type << 2 | (1 << 11) | 1; // 1<< 4 => fcode = 1
pCodec->ME_COMMAND = X;
prevMB = &reference->mbs[counter];
index = pCodec->Raddr;
get_pmvdata0_4MV(current1->mbs, iWcount, x+1, counter, pmv, pParam);
MOTION_ACTIVITY=MAX(abs(pmv[1].x)+abs(pmv[1].y),abs(pmv[3].x)+abs(pmv[3].y));
d_type = (MOTION_ACTIVITY>L1);
/************* check ME_DONE *************/
POLL_ME_DONE_MARKER_START
do {
READ_CPSTS(X)
} while(!(X&0x01));
POLL_ME_DONE_MARKER_END
#ifdef DUMP_PMV_RESULT
if(1) {
int index_array[20]={ 19,20,21,22, 28,29,30,31, 37,38,39,40, 46,47,48,49, 55,56,57,58 };
int c;
int x_result[20],y_result[20],radd_result[20];
int32_t sad_value;
int32_t XX;
static int mv_buffer_selector=0;
mv_buffer_selector^=1;
for(c=0;c<20;c++)
{
sad_value=pCodec->ME_command_queue0[index_array[c]];
if(sad_value & (1<<26))
{ // SRC==1
int src_index=(sad_value>>19)&0x7f;
XX = (pCodec->ME_command_queue0[src_index+(mv_buffer_selector)*4]>>12);
x_result[c]=(XX<<18)>>25;
y_result[c]=(XX<<25)>>25;
}
else
{ // SRC==0
XX = (pCodec->ME_command_queue0[index_array[c]]>>12);
x_result[c]=(XX<<18)>>25;
y_result[c]=(XX<<25)>>25;
}
radd_result[c]=sad_value&0x0fff;
}
for(c=0;c<20;c+=4)
{
#ifdef DUMP_WITH_RADDR
fprintf(pmv_result_file,"\n");
fprintf(pmv_result_file,"Left candidate : 0x%04x, 0x%04x (RADDR:0x%04x)\n",x_result[c]&0x07f,y_result[c]&0x07f,radd_result[c]);
fprintf(pmv_result_file,"Top candidate : 0x%04x, 0x%04x (RADDR:0x%04x)\n",x_result[c+1]&0x07f,y_result[c+1]&0x07f,radd_result[c+1]);
fprintf(pmv_result_file,"Top-Right candidate : 0x%04x, 0x%04x, (RADDR:0x%04x)\n",x_result[c+2]&0x07f,y_result[c+2]&0x07f,radd_result[c+2]);
fprintf(pmv_result_file,"Previous MB candidate : 0x%04x, 0x%04x, (RADDR:0x%04x)\n",x_result[c+3]&0x07f,y_result[c+3]&0x07f,radd_result[c+3]);
#else
fprintf(pmv_result_file,"\n");
fprintf(pmv_result_file,"Left candidate : 0x%04x, 0x%04x\n",x_result[c]&0x07f,y_result[c]&0x07f);
fprintf(pmv_result_file,"Top candidate : 0x%04x, 0x%04x\n",x_result[c+1]&0x07f,y_result[c+1]&0x07f);
fprintf(pmv_result_file,"Top-Right candidate : 0x%04x, 0x%04x\n",x_result[c+2]&0x07f,y_result[c+2]&0x07f);
fprintf(pmv_result_file,"Previous MB candidate : 0x%04x, 0x%04x\n",x_result[c+3]&0x07f,y_result[c+3]&0x07f);
#endif
#if 0
fprintf(pmv_result_file,"\n");
fprintf(pmv_result_file,"Left candidate : (RADDR:0x%04x)\n",radd_result[c]);
fprintf(pmv_result_file,"Top candidate : (RADDR:0x%04x)\n",radd_result[c+1]);
fprintf(pmv_result_file,"Top-Right candidate : (RADDR:0x%04x)\n",radd_result[c+2]);
fprintf(pmv_result_file,"Previous MB candidate : (RADDR:0x%04x)\n",radd_result[c+3]);
#endif
}
#if 0
fprintf(pmv_result_file,"\nFour motion vectors are :\n");
XX = (pCodec->ME_command_queue0[0+(pCodec->even_odd_1)*4]>>12);
fprintf(pmv_result_file,"0x%04x,0x%04x,",((XX<<18)>>25)& 0x7f,((XX<<25)>>25)&0x7f);
XX = (pCodec->ME_command_queue0[1+(pCodec->even_odd_1)*4]>>12);
fprintf(pmv_result_file,"0x%04x,0x%04x,",((XX<<18)>>25)& 0x7f,((XX<<25)>>25)&0x7f);
XX = (pCodec->ME_command_queue0[2+(pCodec->even_odd_1)*4]>>12);
fprintf(pmv_result_file,"0x%04x,0x%04x,",((XX<<18)>>25)& 0x7f,((XX<<25)>>25)&0x7f);
XX = (pCodec->ME_command_queue0[3+(pCodec->even_odd_1)*4]>>12);
fprintf(pmv_result_file,"0x%04x,0x%04x",((XX<<18)>>25)& 0x7f,((XX<<25)>>25)&0x7f);
fprintf(pmv_result_file,"\n");
#endif
}
#endif
//#ifdef DUMP_ME_RESULT
//READ_MIN_SADMV(pMB->sad16)
//READ_CURDEV(mode_decision)
//#endif
if (X & 0x80) {
pCodec->MB_mode = 0;
pMB->mode = MODE_INTRA;
pMB->mv16x_0 = 0;
pMB->mv16y_0 = 0;
pMB->mv16x_1 = 0;
pMB->mv16y_1 = 0;
pMB->mv16x_2 = 0;
pMB->mv16y_2 = 0;
pMB->mv16x_3 = 0;
pMB->mv16y_3 = 0;
} else {
pCodec->MB_mode = 1;
if (X & 64) {
pMB->mode = MODE_INTER;
X = (pCodec->ME_command_queue0[3+(pCodec->even_odd_1)*4]);
X = X >> 12;
pMB->mv16x_0 = (X<<18)>>25;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -