📄 transferidct_mmx.c
字号:
#ifdef _TEST_TRANSFER
#include <stdio.h>
#endif
#include "portab.h"
/*
 * transferIDCT_add
 *
 * Adds an 8x8 block of signed 16-bit values to an 8x8 block of unsigned
 * 8-bit pixels, in place, clamping every result to the range 0..255:
 *
 *     destU8[stride*y + x] = clamp(destU8[stride*y + x] + sourceS16[8*y + x])
 *
 * sourceS16  64 contiguous int16_t values, row-major, 8 per row.
 * destU8     first pixel of the destination block; rewritten in place.
 * stride     byte distance between consecutive destination rows.
 *
 * The work is done with MMX instructions inside an MSVC-style _asm block
 * that keeps pointers in 32-bit registers.  All eight rows are fully
 * unrolled; each row follows the same pattern:
 *   load 8 pixels, widen them to 16-bit words, add the source words with
 *   signed saturation, pack back to bytes with unsigned saturation, store.
 *
 * When _TEST_TRANSFER is defined, a plain C reference result is computed
 * before the MMX code runs and compared afterwards; one message is printed
 * for every mismatching pixel.
 */
void transferIDCT_add(int16_t *sourceS16, uint8_t *destU8, int stride) {
#ifdef _TEST_TRANSFER
    uint8_t reference_dest[64];
    int x, y, sum16;

    /* Reference result, computed before destU8 is overwritten. */
    for (y=0; y<8; y++) {
        for (x=0; x<8; x++) {
            sum16 = (destU8[stride*y + x] + sourceS16[8*y + x]);
            if (sum16 > 255) reference_dest[8*y + x] = 255;
            else if (sum16 < 0) reference_dest[8*y + x] = 0;
            else reference_dest[8*y + x] = (uint8_t)sum16;
        }
    }
#endif

    /*
     * Register roles inside the block:
     *   eax = sourceS16 (never modified, rows addressed by fixed offset)
     *   ebx = current destination row
     *   edi = stride
     *   mm7 = zero, used to widen bytes to words
     *
     * NOTE(review): the MSVC inline-assembler documentation states that the
     * compiler itself preserves eax/ebx/edi around an _asm block, which would
     * make the explicit push/pop redundant.  They are kept so behaviour does
     * not change on other compilers that accept this syntax.
     */
    _asm {
        push        eax
        push        ebx
        push        edi

        mov         eax, sourceS16              ; eax = source block
        mov         ebx, destU8                 ; ebx = destination row 0
        mov         edi, stride                 ; edi = destination row pitch
        pxor        mm7, mm7                    ; mm7 = 0

        ; ---- row 0 ----
        movq        mm0, qword ptr [ebx]        ; mm0 = 8 destination pixels
        movq        mm1, mm0                    ; mm1 = copy for pixels 4-7
        punpcklbw   mm0, mm7                    ; mm0 = pixels 0-3 as words
        punpckhbw   mm1, mm7                    ; mm1 = pixels 4-7 as words
        paddsw      mm0, qword ptr [eax]        ; add source words 0-3
        paddsw      mm1, qword ptr [eax+8]      ; add source words 4-7
        packuswb    mm0, mm1                    ; clamp to 0-255, back to bytes
        movq        qword ptr [ebx], mm0        ; store row 0
        add         ebx, edi                    ; advance to next row

        ; ---- row 1 ----
        movq        mm2, qword ptr [ebx]        ; mm2 = 8 destination pixels
        movq        mm3, mm2                    ; mm3 = copy for pixels 4-7
        punpcklbw   mm2, mm7                    ; mm2 = pixels 0-3 as words
        punpckhbw   mm3, mm7                    ; mm3 = pixels 4-7 as words
        paddsw      mm2, qword ptr [eax+16]     ; add source words 0-3
        paddsw      mm3, qword ptr [eax+24]     ; add source words 4-7
        packuswb    mm2, mm3                    ; clamp to 0-255, back to bytes
        movq        qword ptr [ebx], mm2        ; store row 1
        add         ebx, edi                    ; advance to next row

        ; ---- row 2 ----
        movq        mm4, qword ptr [ebx]        ; mm4 = 8 destination pixels
        movq        mm5, mm4                    ; mm5 = copy for pixels 4-7
        punpcklbw   mm4, mm7                    ; mm4 = pixels 0-3 as words
        punpckhbw   mm5, mm7                    ; mm5 = pixels 4-7 as words
        paddsw      mm4, qword ptr [eax+32]     ; add source words 0-3
        paddsw      mm5, qword ptr [eax+40]     ; add source words 4-7
        packuswb    mm4, mm5                    ; clamp to 0-255, back to bytes
        movq        qword ptr [ebx], mm4        ; store row 2
        add         ebx, edi                    ; advance to next row

        ; ---- row 3 ----
        movq        mm0, qword ptr [ebx]        ; mm0 = 8 destination pixels
        movq        mm1, mm0                    ; register copy, as in every other row
        punpcklbw   mm0, mm7                    ; mm0 = pixels 0-3 as words
        punpckhbw   mm1, mm7                    ; mm1 = pixels 4-7 as words
        paddsw      mm0, qword ptr [eax+48]     ; add source words 0-3
        paddsw      mm1, qword ptr [eax+56]     ; add source words 4-7
        packuswb    mm0, mm1                    ; clamp to 0-255, back to bytes
        movq        qword ptr [ebx], mm0        ; store row 3
        add         ebx, edi                    ; advance to next row

        ; ---- row 4 ----
        movq        mm2, qword ptr [ebx]        ; mm2 = 8 destination pixels
        movq        mm3, mm2                    ; mm3 = copy for pixels 4-7
        punpcklbw   mm2, mm7                    ; mm2 = pixels 0-3 as words
        punpckhbw   mm3, mm7                    ; mm3 = pixels 4-7 as words
        paddsw      mm2, qword ptr [eax+64]     ; add source words 0-3
        paddsw      mm3, qword ptr [eax+72]     ; add source words 4-7
        packuswb    mm2, mm3                    ; clamp to 0-255, back to bytes
        movq        qword ptr [ebx], mm2        ; store row 4
        add         ebx, edi                    ; advance to next row

        ; ---- row 5 ----
        movq        mm4, qword ptr [ebx]        ; mm4 = 8 destination pixels
        movq        mm5, mm4                    ; mm5 = copy for pixels 4-7
        punpcklbw   mm4, mm7                    ; mm4 = pixels 0-3 as words
        punpckhbw   mm5, mm7                    ; mm5 = pixels 4-7 as words
        paddsw      mm4, qword ptr [eax+80]     ; add source words 0-3
        paddsw      mm5, qword ptr [eax+88]     ; add source words 4-7
        packuswb    mm4, mm5                    ; clamp to 0-255, back to bytes
        movq        qword ptr [ebx], mm4        ; store row 5
        add         ebx, edi                    ; advance to next row

        ; ---- row 6 ----
        movq        mm0, qword ptr [ebx]        ; mm0 = 8 destination pixels
        movq        mm1, mm0                    ; mm1 = copy for pixels 4-7
        punpcklbw   mm0, mm7                    ; mm0 = pixels 0-3 as words
        punpckhbw   mm1, mm7                    ; mm1 = pixels 4-7 as words
        paddsw      mm0, qword ptr [eax+96]     ; add source words 0-3
        paddsw      mm1, qword ptr [eax+104]    ; add source words 4-7
        packuswb    mm0, mm1                    ; clamp to 0-255, back to bytes
        movq        qword ptr [ebx], mm0        ; store row 6
        add         ebx, edi                    ; advance to next row

        ; ---- row 7 ----
        movq        mm2, qword ptr [ebx]        ; mm2 = 8 destination pixels
        movq        mm3, mm2                    ; mm3 = copy for pixels 4-7
        punpcklbw   mm2, mm7                    ; mm2 = pixels 0-3 as words
        punpckhbw   mm3, mm7                    ; mm3 = pixels 4-7 as words
        paddsw      mm2, qword ptr [eax+112]    ; add source words 0-3
        paddsw      mm3, qword ptr [eax+120]    ; add source words 4-7
        packuswb    mm2, mm3                    ; clamp to 0-255, back to bytes
        movq        qword ptr [ebx], mm2        ; store row 7

        pop         edi
        pop         ebx
        pop         eax
        emms                                    ; leave MMX state so x87 code can run
    }

#ifdef _TEST_TRANSFER
    /* Compare the MMX result with the reference computed above. */
    for (y=0; y<8; y++) {
        for (x=0; x<8; x++) {
            if (reference_dest[8*y + x] != destU8[stride*y + x]) printf("transferIDCT_add() is broken\n");
        }
    }
#endif
}
/*
 * transferIDCT_copy
 *
 * Writes an 8x8 block of signed 16-bit values into an 8x8 block of unsigned
 * 8-bit pixels, clamping every value to the range 0..255:
 *
 *     destU8[stride*y + x] = clamp(sourceS16[8*y + x])
 *
 * sourceS16  64 contiguous int16_t values, row-major, 8 per row.
 * destU8     first pixel of the destination block.
 * stride     byte distance between consecutive destination rows.
 *
 * The block is processed in two halves of four rows: each half first packs
 * its four source rows into MMX registers and then stores them.  When
 * _TEST_TRANSFER is defined the stored pixels are re-checked in C and one
 * message is printed per mismatching pixel.
 */
void transferIDCT_copy(int16_t *sourceS16, uint8_t *destU8, int stride) {
#ifdef _TEST_TRANSFER
    int idx, row, col, expected;
#endif

    _asm {
        ; preserve the general registers used below
        push        eax
        push        ebx
        push        edi

        mov         eax, sourceS16                  ; eax = source block
        mov         ebx, destU8                     ; ebx = destination row 0
        mov         edi, stride                     ; edi = destination row pitch

        ; ---- rows 0-3: clamp and pack 16-bit words to bytes ----
        movq        mm0, qword ptr [eax]            ; row 0, words 0-3
        packuswb    mm0, qword ptr [eax+8]          ; row 0, words 4-7 appended
        movq        mm1, qword ptr [eax+16]         ; row 1, words 0-3
        packuswb    mm1, qword ptr [eax+24]         ; row 1, words 4-7 appended
        movq        mm2, qword ptr [eax+32]         ; row 2, words 0-3
        packuswb    mm2, qword ptr [eax+40]         ; row 2, words 4-7 appended
        movq        mm3, qword ptr [eax+48]         ; row 3, words 0-3
        packuswb    mm3, qword ptr [eax+56]         ; row 3, words 4-7 appended

        ; ---- rows 0-3: store ----
        movq        qword ptr [ebx], mm0            ; row 0
        add         ebx, edi
        movq        qword ptr [ebx], mm1            ; row 1
        add         ebx, edi
        movq        qword ptr [ebx], mm2            ; row 2
        add         ebx, edi
        movq        qword ptr [ebx], mm3            ; row 3
        add         ebx, edi

        ; ---- rows 4-7: clamp and pack 16-bit words to bytes ----
        movq        mm4, qword ptr [eax+64]         ; row 4, words 0-3
        packuswb    mm4, qword ptr [eax+72]         ; row 4, words 4-7 appended
        movq        mm5, qword ptr [eax+80]         ; row 5, words 0-3
        packuswb    mm5, qword ptr [eax+88]         ; row 5, words 4-7 appended
        movq        mm6, qword ptr [eax+96]         ; row 6, words 0-3
        packuswb    mm6, qword ptr [eax+104]        ; row 6, words 4-7 appended
        movq        mm7, qword ptr [eax+112]        ; row 7, words 0-3
        packuswb    mm7, qword ptr [eax+120]        ; row 7, words 4-7 appended

        ; ---- rows 4-7: store ----
        movq        qword ptr [ebx], mm4            ; row 4
        add         ebx, edi
        movq        qword ptr [ebx], mm5            ; row 5
        add         ebx, edi
        movq        qword ptr [ebx], mm6            ; row 6
        add         ebx, edi
        movq        qword ptr [ebx], mm7            ; row 7

        pop         edi
        pop         ebx
        pop         eax
        emms                                        ; leave MMX state so x87 code can run
    }

#ifdef _TEST_TRANSFER
    /* Re-derive each pixel in C, row by row, and report every mismatch. */
    for (idx = 0; idx < 64; idx++) {
        row = idx / 8;
        col = idx % 8;
        expected = sourceS16[idx];
        if (expected > 255) {
            expected = 255;
        }
        if (expected < 0) {
            expected = 0;
        }
        if (expected != destU8[stride*row + col]) {
            printf("transferIDCT_copy() is broken\n");
        }
    }
#endif
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -