idct_ap528mmx.cpp

来自「这是一组DCT和iDCT的代码」· C++ 代码 · 共 1,169 行 · 第 1/3 页
CPP
1,169 行

movq mm4, mm6
punpckldq mm7, mm5                              ; tmt0

punpckhdq mm2, mm5                              ; tmt2 ; free mm5
;slot

; shuffle the rest of the data, and write it with 2 qword ptr writes
punpckldq mm6, mm3                              ; tmt4
;move from next block
movq mm5, mm2                                   ; duplicate tmt2

punpckhdq mm4, mm3                              ; tmt6 ; free mm3
;move from next block
movq mm3, mm0                                   ; duplicate tmt10

; column 0: odd part (after transpose)
;moved up to prev block
;movq mm3, mm0                                  ; duplicate tmt10
;movq mm5, mm2                                  ; duplicate tmt2

psubsw mm0, mm4                                 ; V110
paddsw mm3, mm4                                 ; V113 ; free mm4

movq mm4, mm0                                   ; duplicate V110
paddsw mm2, mm1                                 ; V111

pmulhw mm0, qword ptr x539f539f539f539f        ; 21407-> V117
psubsw mm5, mm1                                 ; V112 ; free mm1

psubsw mm4, mm5                                 ; V116
movq mm1, mm2                                   ; duplicate V111

pmulhw mm5, qword ptr x4546454645464546        ; 17734-> V119
psubsw mm2, mm3                                 ; V114

pmulhw mm4, qword ptr x61f861f861f861f8        ; 25080-> V120
paddsw mm1, mm3                                 ; V115 ; free mm3

pmulhw mm2, qword ptr x5a825a825a825a82        ; 23170-> V118
psllw mm0, 2                                    ; t266

movq qword ptr[esi+8*0], mm1                   ; save V115
psllw mm5, 1                                    ; t268

psubsw mm5, mm4                                 ; V122
psubsw mm4, mm0                                 ; V121 ; free mm0

psllw mm5, 1                                    ; t270
;slot

psubsw mm5, mm1                                 ; V123 ; free mm1
psllw mm2, 2                                    ; t272

psubsw mm2, mm5                                 ; V124 (keep V123)
psllw mm4, 1                                    ; t274


movq qword ptr[esi+8*2], mm5                   ; save V123 ; free mm5
paddsw mm4, mm2                                 ; V125 (keep V124)

; column 0: even part (after transpose)
movq mm0, qword ptr[esi+8*12]                  ; tmt12
movq mm3, mm6                                   ; duplicate tmt4

psubsw mm6, mm0                                 ; V100
paddsw mm3, mm0                                 ; V101 ; free mm0

pmulhw mm6, qword ptr  x5a825a825a825a82       ; 23170 ->V102
movq mm5, mm7                                   ; duplicate tmt0

movq mm1, qword ptr[esi+8*8]                   ; tmt8
;slot

paddsw mm7, mm1                                 ; V103
psubsw mm5, mm1                                 ; V104 ; free mm1

movq mm0, mm7                                   ; duplicate V103
psllw mm6, 2                                    ; t245

paddsw mm7, mm3                                 ; V106
movq mm1, mm5                                   ; duplicate V104

psubsw mm6, mm3                                 ; V105
psubsw mm0, mm3                                 ; V109; free mm3   

paddsw mm5, mm6                                 ; V107
psubsw mm1, mm6                                 ; V108 ; free mm6

; column 0: output butterfly (after transform)
movq mm3, mm1                                   ; duplicate V108
paddsw mm1, mm2                                 ; out4

psraw mm1, 4
psubsw mm3, mm2                                 ; out10 ; free mm2

psraw mm3, 4
movq mm6, mm0                                   ; duplicate V109

movq qword ptr[esi+8*4], mm1                   ; out4 ; free mm1
psubsw mm0, mm4                                 ; out6

movq qword ptr[esi+8*10], mm3                  ; out10 ; free mm3
psraw mm0, 4

paddsw mm6, mm4                                 ; out8 ; free mm4
movq mm1, mm7                                   ; duplicate V106

movq qword ptr[esi+8*6], mm0                   ; out6 ; free mm0
psraw mm6, 4

movq mm4, qword ptr[esi+8*0]                   ; V115
;slot

movq qword ptr[esi+8*8], mm6                   ; out8 ; free mm6
movq mm2, mm5   ; duplicate V107

movq mm3, qword ptr[esi+8*2]                   ; V123
paddsw mm7, mm4                                 ; out0

;moved up from next block
movq mm0, qword ptr scratch3
psraw mm7, 4

;moved up from next block
movq mm6, qword ptr scratch5
psubsw mm1, mm4                                 ; out14 ; free mm4

paddsw mm5, mm3                                 ; out2
psraw mm1, 4

movq qword ptr[esi], mm7                       ; out0 ; free mm7
psraw mm5, 4

movq qword ptr[esi+8*14], mm1                  ; out14 ; free mm1
psubsw mm2, mm3                                 ; out12 ; free mm3

movq qword ptr[esi+8*2], mm5                   ; out2 ; free mm5
psraw mm2, 4

;moved up to the prev block
movq mm4, qword ptr scratch7
;moved up to the prev block
psraw mm0, 4

movq qword ptr[esi+8*12], mm2                  ; out12 ; free mm2
;moved up to the prev block
psraw mm6, 4

;move back the data to its correct place
;moved up to the prev block
;movq mm0, qword ptr scratch3
;movq mm6, qword ptr scratch5
;movq mm4, qword ptr scratch7
;psraw mm0, 4
;psraw mm6, 4

movq mm1, qword ptr scratch1
psraw mm4, 4

movq qword ptr [esi+8*3], mm0          ; out3
psraw mm1, 4

movq qword ptr [esi+8*5], mm6          ; out5
;slot

movq qword ptr [esi+8*7], mm4          ; out7
;slot

movq qword ptr [esi+8*1], mm1          ; out1
;slot

  
	// done with the iDCT column-transformation

	// now we have to transpose the output 8x8 matrix
	//    8x8 (OUT) -> 8x8't' (IN)

	// the transposition is implemented as 5 sub-operations.
	// 1) transpose upper-left quad  (in-place)
	// 2) transpose lower-right quad (in-place)
	// 3) transpose lower-left quad  (copy to scratch regs)
	// 4) transpose upper-right quad (store into lower-left quad)
	// 5) copy scratch-regs to upper-right quad


 
	// mm0 = 1st row [ A B C D ] row1
	// mm1 = 2nd row [ E F G H ] 2
	// mm2 = 3rd row [ I J K L ] 3
	// mm3 = 4th row [ M N O P ] 4

	// 1) transpose upper-left quad
	movq mm0, qword ptr [esi + ROW_STRIDE * 0 ]

	movq mm1, qword ptr [esi + ROW_STRIDE * 1 ]
	 movq mm4, mm0;	// mm4 = copy of row1[A B C D]
	
	movq mm2, qword ptr [esi + ROW_STRIDE * 2 ]
	 punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
	
	movq mm3, qword ptr [esi + ROW_STRIDE * 3]
	 punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]

	movq mm6, mm2;
	 punpcklwd mm2, mm3;	// mm2 = [ 8 12 9 13]

	punpckhwd mm6, mm3;	// mm6 = 10 14 11 15]
	 movq mm1, mm0;	// mm1 = [ 0 4 1 5]

	punpckldq mm0, mm2;	// final result mm0 = row1 [0 4 8 12]

	movq mm3, mm4;	// mm3 = [ 2 6 3 7]
	 punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]

	movq qword ptr [ esi + ROW_STRIDE * 0 ], mm0; // store row 1
	 punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]

// begin reading next quadrant (lower-right)
	movq mm0, qword ptr [esi + ROW_STRIDE*4 + 8]; 
	 punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]

	movq qword ptr [ esi +ROW_STRIDE * 2], mm4; // store row 3
	 movq mm4, mm0;	// mm4 = copy of row1[A B C D]

	movq qword ptr [ esi +ROW_STRIDE * 1], mm1; // store row 2

	movq mm1, qword ptr [esi + ROW_STRIDE*5 + 8]

	movq qword ptr [ esi +ROW_STRIDE * 3], mm3; // store row 4
	 punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]

	// 2) transpose lower-right quadrant

//	movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8]

//	movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8]
//	 movq mm4, mm0;	// mm4 = copy of row1[A B C D]
	
	movq mm2, qword ptr [esi + ROW_STRIDE*6 + 8]
//	 punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
	 punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
	
	movq mm3, qword ptr [esi + ROW_STRIDE*7 + 8]
	 movq mm6, mm2;

	punpcklwd mm2, mm3;	// mm2 = [ 8 12 9 13]
	 movq mm1, mm0;	// mm1 = [ 0 4 1 5]

	punpckhwd mm6, mm3;	// mm6 = 10 14 11 15]
	 movq mm3, mm4;	// mm3 = [ 2 6 3 7]

	punpckldq mm0, mm2;	// final result mm0 = row1 [0 4 8 12]

	punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
	; // slot

	movq qword ptr [ esi + ROW_STRIDE*4 + 8], mm0; // store row 1
	 punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]

	movq mm0, qword ptr [esi + ROW_STRIDE * 4 ]
	 punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]
	movq qword ptr [ esi +ROW_STRIDE*6 + 8], mm4; // store row 3
	 movq mm4, mm0;	// mm4 = copy of row1[A B C D]

	movq qword ptr [ esi +ROW_STRIDE*5 + 8], mm1; // store row 2
	 ; // slot
	movq mm1, qword ptr [esi + ROW_STRIDE * 5 ]
	 ; // slot

	movq qword ptr [ esi +ROW_STRIDE*7 + 8], mm3; // store row 4
	 punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]

  // 3) transpose lower-left
	 // this segment will store the output in SCRATCH registers
//	movq mm0, qword ptr [OUT + ROW_STRIDE * 4 ]

//	movq mm1, qword ptr [OUT + ROW_STRIDE * 5 ]
//	 movq mm4, mm0;	// mm4 = copy of row1[A B C D]
	
	movq mm2, qword ptr [esi + ROW_STRIDE * 6 ]
//	 punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
	 punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
	
	movq mm3, qword ptr [esi + ROW_STRIDE * 7 ]
	 movq mm6, mm2;

	punpcklwd mm2, mm3;	// mm2 = [ 8 12 9 13]
	 movq mm1, mm0;	// mm1 = [ 0 4 1 5]

	punpckhwd mm6, mm3;	// mm6 = 10 14 11 15]
	 movq mm3, mm4;	// mm3 = [ 2 6 3 7]

	punpckldq mm0, mm2;	// final result mm0 = row1 [0 4 8 12]

	punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
	 ;//slot

	movq qword ptr [ scratch1 ], mm0; // store row 1
	 punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]

// begin reading next quadrant (upper-right)
	movq mm0, qword ptr [esi+ ROW_STRIDE*0 + 8]; 
	 punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]

	movq qword ptr [ scratch5 ], mm4; // store row 3
	 movq mm4, mm0;	// mm4 = copy of row1[A B C D]

	movq qword ptr [ scratch3 ], mm1; // store row 2
	movq mm1, qword ptr [esi + ROW_STRIDE*1 + 8]

	movq qword ptr [ scratch7 ], mm3; // store row 4
	 punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]


	// 4) transpose upper-right quadrant
	//    (the transposed result is stored into the lower-left quadrant)

//	movq mm0, qword ptr [OUT + ROW_STRIDE*0 + 8]

//	movq mm1, qword ptr [OUT + ROW_STRIDE*1 + 8]
//	 movq mm4, mm0;	// mm4 = copy of row1[A B C D]
	
	movq mm2, qword ptr [esi + ROW_STRIDE*2 + 8]
//	 punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
	 punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
	
	movq mm3, qword ptr [esi + ROW_STRIDE*3 + 8]
	 movq mm6, mm2;

	punpcklwd mm2, mm3;	// mm2 = [ 8 12 9 13]
	 movq mm1, mm0;	// mm1 = [ 0 4 1 5]

	punpckhwd mm6, mm3;	// mm6 = 10 14 11 15]
	 movq mm3, mm4;	// mm3 = [ 2 6 3 7]

	movq mm7, qword ptr [ scratch1 ];	// load row1[A B C D]
	 punpckldq mm0, mm2;	// final result mm0 = row1q [0 4 8 12]

	movq mm5, qword ptr [ scratch3 ];	// load row2q [A B C D]
	 punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]

	movq qword ptr [ esi + ROW_STRIDE*4 ], mm0; // store row 1
	 punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]

	movq qword ptr [ esi +ROW_STRIDE*5 ], mm1; // store row 2
	 punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]

	movq qword ptr [ esi +ROW_STRIDE*6 ], mm4; // store row 3
	; // slot
	movq mm0, qword ptr [ scratch5 ]; // load row 3q
	; // slot
	movq qword ptr [ esi +ROW_STRIDE*7 ], mm3; // store row 4
	 ; // slot
	movq mm1, qword ptr [ scratch7 ]; // load row 4q
	 ; // slot
	// write the transposed lower-left quad (its rows 1-4 were parked in
	// scratch1/3/5/7 and reloaded into mm7/mm5/mm0/mm1) to the upper-right quad
	movq qword ptr [ esi + ROW_STRIDE*0 + 8], mm7; // store row 1q

	movq qword ptr [ esi + ROW_STRIDE*1 + 8], mm5; // store row 2q

	movq qword ptr [ esi + ROW_STRIDE*2 + 8], mm0; // store row 3q

	movq qword ptr [ esi + ROW_STRIDE*3 + 8], mm1; // store row 4q

	// done with transpose operation

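	// emms
	// NOTE(review): emms is left commented out here, so the MMX state is still
	// active when this __asm block ends; an emms must be executed before any
	// x87 floating-point code runs -- confirm that the caller takes care of it.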
	

	} // end __asm
	// transpose

	// the above MMX-code is equivalent to the simple C-loop below

/*	int i,j;
	short temp;

	for ( i = 1; i < 8; ++i)
		for ( j = 0; j < i; ++j )
		{
			// swap elements matrix elements #(i,j) and #(j,i)
			temp = src_result[ i + (j<<3) ];
			src_result[ i + (j<<3) ] = src_result[ j + (i<<3) ];
			src_result[ j + (i<<3) ] = temp;
		}
	*/
}
idct_ap528mmx.cpp - 源码说明

本页面展示了「这是一组DCT和iDCT的代码」中的 idct_ap528mmx.cpp 源码文件，采用 C++ 编程语言编写，共 1,169 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与iDCT相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?