Commit 0d88274d authored by Loren Merritt

faster intra search: filter i8x8 edges only once, and reuse for multiple predictions.



git-svn-id: svn://svn.videolan.org/x264/trunk@520 df754926-b1dd-0310-bc7b-ec298dee348c
parent 3de28cd5
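
The change this diff implements, in rough C terms: instead of every i8x8 prediction routine re-reading and re-filtering the neighboring pixels from the reconstructed frame (and each taking an i_neighbors flag), the edges are lowpass-filtered once into a small buffer, and every prediction mode then reads from that buffer, as the new (uint8_t *src, uint8_t *edge) prototypes below show. A minimal sketch of the calling pattern follows; the helper name filter_8x8_edges, the edge[] size and layout, and the FDEC_STRIDE value are illustrative assumptions, not taken from this commit.

    #include <stdint.h>

    #define FDEC_STRIDE 32   /* stride of x264's decoded-MB cache (assumed) */

    /* Gathers the left/top-left/top/top-right neighbors of the 8x8 block,
     * honoring i_neighbors, and runs the 3-tap (a + 2*b + c + 2) >> 2
     * filter over them once.  Definition elided here; the layout of
     * edge[] is an assumption for this sketch. */
    void filter_8x8_edges( uint8_t edge[33], const uint8_t *fdec,
                           int i_neighbors );

    /* New-style predictors from this diff: they read only the prefiltered
     * edge buffer, so the filtering work is no longer repeated per mode. */
    void predict_8x8_v_mmxext  ( uint8_t *src, uint8_t *edge );
    void predict_8x8_dc_mmxext ( uint8_t *src, uint8_t *edge );
    void predict_8x8_ddl_sse2  ( uint8_t *src, uint8_t *edge );

    void try_i8x8_modes( uint8_t *fdec, int i_neighbors )
    {
        uint8_t edge[33];
        filter_8x8_edges( edge, fdec, i_neighbors );   /* filter once...   */
        predict_8x8_v_mmxext ( fdec, edge );           /* ...then reuse it */
        predict_8x8_dc_mmxext( fdec, edge );           /* for every mode   */
        predict_8x8_ddl_sse2 ( fdec, edge );
    }
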
@@ -48,7 +48,7 @@ cglobal x264_pixel_satd_8x16_sse2
cglobal x264_pixel_satd_16x16_sse2
cglobal x264_pixel_sa8d_8x8_sse2
cglobal x264_pixel_sa8d_16x16_sse2
cglobal x264_intra_sa8d_x3_8x8_sse2
cglobal x264_intra_sa8d_x3_8x8_core_sse2
%macro SAD_INC_4x16P_SSE2 0
movdqu xmm1, [rdx]
@@ -627,16 +627,21 @@ x264_pixel_sa8d_16x16_sse2:
%macro LOAD_HADAMARD8 1
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
x264_intra_sa8d_x3_8x8_core_sse2:
; 8x8 hadamard
pxor xmm4, xmm4
movq xmm0, [%1+0*FENC_STRIDE]
movq xmm7, [%1+1*FENC_STRIDE]
movq xmm6, [%1+2*FENC_STRIDE]
movq xmm3, [%1+3*FENC_STRIDE]
movq xmm5, [%1+4*FENC_STRIDE]
movq xmm1, [%1+5*FENC_STRIDE]
movq xmm8, [%1+6*FENC_STRIDE]
movq xmm2, [%1+7*FENC_STRIDE]
movq xmm0, [parm1q+0*FENC_STRIDE]
movq xmm7, [parm1q+1*FENC_STRIDE]
movq xmm6, [parm1q+2*FENC_STRIDE]
movq xmm3, [parm1q+3*FENC_STRIDE]
movq xmm5, [parm1q+4*FENC_STRIDE]
movq xmm1, [parm1q+5*FENC_STRIDE]
movq xmm8, [parm1q+6*FENC_STRIDE]
movq xmm2, [parm1q+7*FENC_STRIDE]
punpcklbw xmm0, xmm4
punpcklbw xmm7, xmm4
punpcklbw xmm6, xmm4
@@ -648,128 +653,11 @@ x264_pixel_sa8d_16x16_sse2:
HADAMARD1x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2
TRANSPOSE8x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4
HADAMARD1x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
%endmacro
%macro SCALAR_SUMSUB 4
add %1, %2
add %3, %4
add %2, %2
add %4, %4
sub %2, %1
sub %4, %3
%endmacro
%macro SCALAR_HADAMARD1x8 9 ; 8x tmp, dst
SCALAR_SUMSUB %1, %5, %2, %6
SCALAR_SUMSUB %3, %7, %4, %8
SCALAR_SUMSUB %1, %3, %2, %4
SCALAR_SUMSUB %5, %7, %6, %8
SCALAR_SUMSUB %1, %2, %3, %4
SCALAR_SUMSUB %5, %6, %7, %8
mov [%9+0], %1
mov [%9+2], %2
mov [%9+4], %3
mov [%9+6], %4
mov [%9+8], %5
mov [%9+10], %6
mov [%9+12], %7
mov [%9+14], %8
%endmacro
; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED8x8_LOWPASS 5
movq %5, %2
pavgb %2, %3
pxor %3, %5
movq %1, %4
pand %3, [pb_1 GLOBAL]
psubusb %2, %3
pavgb %1, %2
%endmacro
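
A scalar C model of the PRED8x8_LOWPASS macro above may help in reading the pavgb/pxor/pand/psubusb sequence; this is one interpretation of the trick, not code from the commit: pavgb rounds up, so subtracting (left ^ right) & 1 turns avg(left, right) into floor((left + right) / 2), and a second rounding average against the centre sample then yields exactly (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 without ever widening past 8 bits.

    #include <stdint.h>

    static inline uint8_t avg_up( uint8_t a, uint8_t b )
    {
        return (uint8_t)(( a + b + 1 ) >> 1);            /* what pavgb computes */
    }

    /* equals ( l + 2*s + r + 2 ) >> 2 for all 8-bit inputs */
    static inline uint8_t lowpass( uint8_t l, uint8_t s, uint8_t r )
    {
        uint8_t lr = (uint8_t)( avg_up( l, r ) - (( l ^ r ) & 1) );  /* floor((l+r)/2) */
        return avg_up( s, lr );
    }
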
; output: mm0 = filtered t0..t7
; assumes topleft is available
%macro PRED8x8_LOAD_TOP_FILT 1
movq mm1, [%1-1]
movq mm2, [%1+1]
and parm4d, byte 4
jne .have_topright
mov al, [%1+7]
mov ah, al
pinsrw mm2, eax, 3
.have_topright:
PRED8x8_LOWPASS mm0, mm1, mm2, [%1], mm7
%endmacro
%macro PRED8x8_LOAD_LEFT_FILT 10 ; 8x reg, tmp, src
movzx %1, byte [%10-1*FDEC_STRIDE]
movzx %2, byte [%10+0*FDEC_STRIDE]
movzx %3, byte [%10+1*FDEC_STRIDE]
movzx %4, byte [%10+2*FDEC_STRIDE]
movzx %5, byte [%10+3*FDEC_STRIDE]
movzx %6, byte [%10+4*FDEC_STRIDE]
movzx %7, byte [%10+5*FDEC_STRIDE]
movzx %8, byte [%10+6*FDEC_STRIDE]
movzx %9, byte [%10+7*FDEC_STRIDE]
lea %1, [%1+%2+1]
lea %2, [%2+%3+1]
lea %3, [%3+%4+1]
lea %4, [%4+%5+1]
lea %5, [%5+%6+1]
lea %6, [%6+%7+1]
lea %7, [%7+%8+1]
lea %8, [%8+%9+1]
lea %9, [%9+%9+1]
add %1, %2
add %2, %3
add %3, %4
add %4, %5
add %5, %6
add %6, %7
add %7, %8
add %8, %9
shr %1, 2
shr %2, 2
shr %3, 2
shr %4, 2
shr %5, 2
shr %6, 2
shr %7, 2
shr %8, 2
%endmacro
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_sse2( uint8_t *fenc, uint8_t *fdec,
; int *res, int i_neighbors )
;-----------------------------------------------------------------------------
x264_intra_sa8d_x3_8x8_sse2:
%define left_1d rsp-16 ; +16
%define top_1d rsp-32 ; +16
push rbx
push r12
push r13
push r14
push r15
LOAD_HADAMARD8 parm1q
PRED8x8_LOAD_LEFT_FILT r8, r9, r10, r11, r12, r13, r14, r15, rax, parm2q-1
SCALAR_HADAMARD1x8 r8d, r9d, r10d, r11d, r12d, r13d, r14d, r15d, left_1d
mov edi, r8d ; dc
PRED8x8_LOAD_TOP_FILT parm2q-FDEC_STRIDE
movq [top_1d], mm0
movzx r8d, byte [top_1d+0]
movzx r9d, byte [top_1d+1]
movzx r10d, byte [top_1d+2]
movzx r11d, byte [top_1d+3]
movzx r12d, byte [top_1d+4]
movzx r13d, byte [top_1d+5]
movzx r14d, byte [top_1d+6]
movzx r15d, byte [top_1d+7]
SCALAR_HADAMARD1x8 r8w, r9w, r10w, r11w, r12w, r13w, r14w, r15w, top_1d
lea rdi, [rdi + r8 + 8] ; dc
; dc
movzx edi, word [parm2q+0]
add di, word [parm2q+16]
add edi, 8
and edi, -16
shl edi, 2
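
A note on the dc value built just above, assuming edges[0][0] and edges[1][0] are the first outputs of SCALAR_HADAMARD1x8, i.e. the sums of the eight filtered left and top edge pixels (a reading of the code, not something stated in the commit): the register ends up holding 64 times the H.264 8x8 DC prediction value, which is the only nonzero coefficient of the 8x8 Hadamard transform of a flat DC-predicted block, so it can be subtracted directly in the transform domain.

    #include <stdint.h>

    /* sketch only; ((sum + 8) >> 4) << 6 is the same value the asm builds
     * with "add 8; and -16; shl 2" */
    int intra8x8_dc_hadamard( const int16_t edges[2][8] )
    {
        int sum_left = edges[0][0];               /* 1-D Hadamard DC = edge sum */
        int sum_top  = edges[1][0];
        int dc       = ( sum_left + sum_top + 8 ) >> 4;  /* 8x8 DC prediction   */
        return dc << 6;                           /* Hadamard DC of a flat block */
    }
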
@@ -786,7 +674,7 @@ x264_intra_sa8d_x3_8x8_sse2:
SUM1x8_SSE2 xmm8, xmm10, xmm15
movdqa xmm14, xmm15 ; 7x8 sum
movdqa xmm8, [left_1d] ; left edge
movdqa xmm8, [parm2q+0] ; left edge
movd xmm9, edi
psllw xmm8, 3
psubw xmm8, xmm0
@@ -800,7 +688,7 @@ x264_intra_sa8d_x3_8x8_sse2:
punpckldq xmm0, xmm2
punpckldq xmm4, xmm6
punpcklqdq xmm0, xmm4 ; transpose
movdqa xmm1, [top_1d]
movdqa xmm1, [parm2q+16] ; top edge
movdqa xmm2, xmm15
psllw xmm1, 3
psrldq xmm2, 2 ; 8x7 sum
@@ -820,9 +708,4 @@ x264_intra_sa8d_x3_8x8_sse2:
shr eax, 2
mov [parm3q+0], eax ; i8x8_v sa8d
pop r15
pop r14
pop r13
pop r12
pop rbx
ret
@@ -29,14 +29,14 @@ BITS 64
%include "amd64inc.asm"
%macro STORE8x8 2
movq [parm1q + 0*FDEC_STRIDE], %1
movq [parm1q + 1*FDEC_STRIDE], %1
movq [parm1q + 2*FDEC_STRIDE], %1
movq [parm1q + 3*FDEC_STRIDE], %1
movq [parm1q + 4*FDEC_STRIDE], %1
movq [parm1q + 4*FDEC_STRIDE], %2
movq [parm1q + 5*FDEC_STRIDE], %2
movq [parm1q + 6*FDEC_STRIDE], %2
movq [parm1q + 7*FDEC_STRIDE], %2
movq [parm1q + 8*FDEC_STRIDE], %2
%endmacro
%macro STORE16x16 2
@@ -62,14 +62,15 @@ SECTION .rodata align=16
ALIGN 16
pw_2: times 4 dw 2
pw_4: times 4 dw 4
pw_8: times 4 dw 8
pb_1: times 16 db 1
pw_3210:
dw 0
dw 1
dw 2
dw 3
ALIGN 16
pb_1: times 16 db 1
pb_00s_ff:
times 8 db 0
pb_0s_ff:
@@ -85,12 +86,14 @@ SECTION .text
cglobal predict_4x4_ddl_mmxext
cglobal predict_4x4_vl_mmxext
cglobal predict_8x8_v_mmxext
cglobal predict_8x8_dc_mmxext
cglobal predict_8x8_dc_top_mmxext
cglobal predict_8x8_dc_left_mmxext
cglobal predict_8x8_ddl_mmxext
cglobal predict_8x8_ddl_sse2
cglobal predict_8x8_ddr_sse2
cglobal predict_8x8_vl_sse2
cglobal predict_8x8_vr_core_mmxext
cglobal predict_8x8_dc_core_mmxext
cglobal predict_8x8c_v_mmx
cglobal predict_8x8c_dc_core_mmxext
cglobal predict_8x8c_p_core_mmxext
@@ -118,76 +121,9 @@ cglobal predict_16x16_dc_top_mmxext
PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa
%endmacro
; output: mm0 = filtered t0..t7
%macro PRED8x8_LOAD_TOP_FILT 0
sub parm1q, FDEC_STRIDE
and parm2d, 12
movq mm1, [parm1q-1]
movq mm2, [parm1q+1]
cmp parm2d, byte 8
jge .have_topleft
mov al, [parm1q]
mov ah, al
pinsrw mm1, eax, 0
.have_topleft:
and parm2d, byte 4
jne .have_topright
mov al, [parm1q+7]
mov ah, al
pinsrw mm2, eax, 3
.have_topright:
PRED8x8_LOWPASS mm0, mm1, mm2, [parm1q], mm7
%endmacro
; output: xmm0 = unfiltered t0..t15
; xmm1 = unfiltered t1..t15
; xmm2 = unfiltered tl..t14
%macro PRED8x8_LOAD_TOP_TOPRIGHT_XMM 0
sub parm1q, FDEC_STRIDE
and parm2d, 12
movdqu xmm1, [parm1q-1]
cmp parm2d, byte 8
jge .have_topleft
mov al, [parm1q]
mov ah, al
pinsrw xmm1, eax, 0
.have_topleft:
and parm2d, byte 4
jne .have_topright
mov al, [parm1q+7]
mov ah, al
pinsrw xmm1, eax, 4
pshufhw xmm1, xmm1, 0
movdqa xmm0, xmm1
movdqa xmm2, xmm1
psrldq xmm0, 1
psrldq xmm2, 2
pshufhw xmm0, xmm0, 0
pshufhw xmm2, xmm2, 0
jmp .done_topright
.have_topright:
movdqu xmm0, [parm1q]
movdqa xmm2, xmm0
psrldq xmm2, 1
mov al, [parm1q+15]
mov ah, al
pinsrw xmm2, eax, 7
.done_topright:
%endmacro
;-----------------------------------------------------------------------------
;
; void predict_4x4_ddl_mmxext( uint8_t *src )
;
;-----------------------------------------------------------------------------
ALIGN 16
@@ -213,9 +149,7 @@ predict_4x4_ddl_mmxext:
ret
;-----------------------------------------------------------------------------
;
; void predict_4x4_vl_mmxext( uint8_t *src )
;
;-----------------------------------------------------------------------------
ALIGN 16
@@ -240,211 +174,127 @@ predict_4x4_vl_mmxext:
ret
;-----------------------------------------------------------------------------
;
; void predict_8x8_v_mmxext( uint8_t *src, int i_neighbors )
;
; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_v_mmxext:
PRED8x8_LOAD_TOP_FILT
movq mm0, [parm2q+16]
STORE8x8 mm0, mm0
ret
;-----------------------------------------------------------------------------
;
; void predict_8x8_dc_core_mmxext( uint8_t *src, int i_neighbors, uint8_t *pix_left );
;
; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_dc_core_mmxext:
movq mm1, [parm3q-1]
movq mm2, [parm3q+1]
PRED8x8_LOWPASS mm4, mm1, mm2, [parm3q], mm7
PRED8x8_LOAD_TOP_FILT
predict_8x8_dc_mmxext:
pxor mm0, mm0
pxor mm1, mm1
psadbw mm0, mm1
psadbw mm4, mm1
psadbw mm0, [parm2q+7]
psadbw mm1, [parm2q+16]
paddw mm0, [pw_8 GLOBAL]
paddw mm0, mm4
paddw mm0, mm1
psrlw mm0, 4
pshufw mm0, mm0, 0
packuswb mm0, mm0
STORE8x8 mm0, mm0
ret
;-----------------------------------------------------------------------------
; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_dc_top_mmxext:
pxor mm0, mm0
psadbw mm0, [parm2q+16]
paddw mm0, [pw_4 GLOBAL]
psrlw mm0, 3
pshufw mm0, mm0, 0
packuswb mm0, mm0
STORE8x8 mm0, mm0
ret
;-----------------------------------------------------------------------------
; void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_dc_left_mmxext:
pxor mm0, mm0
psadbw mm0, [parm2q+7]
paddw mm0, [pw_4 GLOBAL]
psrlw mm0, 3
pshufw mm0, mm0, 0
packuswb mm0, mm0
STORE8x8 mm0, mm0
ret
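
The three dc routines above reduce to simple averages of the prefiltered edge bytes (psadbw against a zero register sums eight bytes in one instruction). A scalar equivalent, assuming the left samples sit at edge[7..14] and the top samples at edge[16..23] as the offsets in the asm suggest; the _left variant is the same as the _top one with edge+7 in place of edge+16.

    #include <stdint.h>
    #include <string.h>

    #define FDEC_STRIDE 32   /* x264's decoded-MB cache stride (assumed) */

    static int sum8( const uint8_t *p )    /* what psadbw vs. zero computes */
    {
        int s = 0;
        for( int i = 0; i < 8; i++ )
            s += p[i];
        return s;
    }

    void predict_8x8_dc_c( uint8_t *src, const uint8_t *edge )
    {
        int dc = ( sum8( edge+7 ) + sum8( edge+16 ) + 8 ) >> 4;  /* left + top */
        for( int y = 0; y < 8; y++ )
            memset( &src[y*FDEC_STRIDE], dc, 8 );
    }

    void predict_8x8_dc_top_c( uint8_t *src, const uint8_t *edge )
    {
        int dc = ( sum8( edge+16 ) + 4 ) >> 3;                   /* top only */
        for( int y = 0; y < 8; y++ )
            memset( &src[y*FDEC_STRIDE], dc, 8 );
    }
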
;-----------------------------------------------------------------------------
;
; void predict_8x8_ddl_mmxext( uint8_t *src, int i_neighbors )
;
; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_ddl_mmxext:
sub parm1q, FDEC_STRIDE
movq mm5, [parm2q+16]
movq mm2, [parm2q+17]
movq mm3, [parm2q+23]
movq mm4, [parm2q+25]
movq mm1, mm5
psllq mm1, 8
PRED8x8_LOWPASS mm0, mm1, mm2, mm5, mm7
PRED8x8_LOWPASS mm1, mm3, mm4, [parm2q+24], mm6
and parm2d, 12
movq mm1, [parm1q-1]
movq mm2, [parm1q+1]
cmp parm2d, byte 8
jge .have_topleft
mov al, [parm1q]
mov ah, al
pinsrw mm1, eax, 0
.have_topleft:
and parm2d, byte 4
jne .have_topright
mov al, [parm1q+7]
mov ah, [parm1q+7]
pinsrw mm2, eax, 3
pshufw mm3, mm2, 0xff
jmp .done_topright
.have_topright:
movq mm5, [parm1q+9];
mov al, [parm1q+15]
mov ah, al
pinsrw mm5, eax, 3
movq mm4, [parm1q+7];
PRED8x8_LOWPASS mm3, mm4, mm5, [parm1q+8], mm7
.done_topright:
;?0123456789abcdeff
; [-mm0--][-mm3--]
;[-mm1--][-mm4--]
; [-mm2--][-mm5--]
PRED8x8_LOWPASS mm0, mm1, mm2, [parm1q], mm7
movq mm1, mm0
%assign Y 7
%rep 6
movq [parm1q+Y*FDEC_STRIDE], mm1
movq mm2, mm0
psllq mm1, 8
psrlq mm2, 8
movq mm6, mm3
movq mm4, mm3
psllq mm6, 56
movq mm7, mm0
por mm2, mm6
psllq mm4, 8
movq mm5, mm3
movq mm6, mm3
psrlq mm5, 8
pand mm6, [pb_0s_ff GLOBAL]
psrlq mm7, 56
por mm5, mm6
por mm4, mm7
PRED8x8_LOWPASS mm6, mm1, mm2, mm0, mm7
PRED8x8_LOWPASS mm7, mm4, mm5, mm3, mm2
%assign Y 8
%rep 6
movq [parm1q+Y*FDEC_STRIDE], mm7
movq mm1, mm6
psllq mm7, 8
psrlq mm1, 56
psllq mm6, 8
por mm7, mm1
psrlq mm2, 56
psllq mm0, 8
por mm1, mm2
%assign Y (Y-1)
%endrep
movq [parm1q+Y*FDEC_STRIDE], mm7
psllq mm7, 8
psrlq mm6, 56
por mm7, mm6
movq [parm1q+Y*FDEC_STRIDE], mm1
psllq mm1, 8
psrlq mm0, 56
por mm1, mm0
%assign Y (Y-1)
movq [parm1q+Y*FDEC_STRIDE], mm7
movq [parm1q+Y*FDEC_STRIDE], mm1
ret
;-----------------------------------------------------------------------------
;
; void predict_8x8_ddl_sse2( uint8_t *src, int i_neighbors )
;
; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_ddl_sse2:
PRED8x8_LOAD_TOP_TOPRIGHT_XMM
;?0123456789abcdeff
; [-----xmm0-----]
;[-----xmm1-----]
; [-----xmm2-----]
movdqa xmm3, [pb_00s_ff GLOBAL]
PRED8x8_LOWPASS_XMM xmm4, xmm1, xmm2, xmm0, xmm5
movdqa xmm1, xmm4
movdqa xmm2, xmm4
pand xmm3, xmm4
psrldq xmm2, 1
movdqa xmm3, [parm2q+16]
movdqu xmm2, [parm2q+17]
movdqa xmm1, xmm3
pslldq xmm1, 1
por xmm2, xmm3
PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5
PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
%assign Y 1
%assign Y 0
%rep 8
psrldq xmm0, 1
movq [parm1q+Y*FDEC_STRIDE], xmm0
%assign Y (Y+1)
%endrep
ret
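
For reference, a plain C model of the diagonal-down-left prediction these two routines implement, under the assumption (suggested by the offsets above) that edge[16..31] hold the already-filtered top and top-right samples t0..t15; this is the standard H.264 Intra_8x8 DDL formula, not code lifted from x264.

    #include <stdint.h>

    #define FDEC_STRIDE 32   /* x264's decoded-MB cache stride (assumed) */

    void predict_8x8_ddl_c( uint8_t *src, const uint8_t *edge )
    {
        const uint8_t *t = edge + 16;            /* filtered t0..t15 (assumed) */
        for( int y = 0; y < 8; y++ )
            for( int x = 0; x < 8; x++ )
            {
                int i = x + y;
                int c = i + 2 > 15 ? 15 : i + 2; /* bottom-right pixel clamps to t15 */
                src[y*FDEC_STRIDE + x] =
                    (uint8_t)(( t[i] + 2*t[i+1] + t[c] + 2 ) >> 2);
            }
    }
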
;-----------------------------------------------------------------------------
;
; void predict_8x8_ddr_sse2( uint8_t *src, int i_neighbors )
;
; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_ddr_sse2:
lea r8, [rsp-24]
movq mm0, [parm1q-FDEC_STRIDE]
movq [r8+8], mm0
and parm2d, byte 4
mov al, [parm1q-FDEC_STRIDE+7]
cmovnz ax, [parm1q-FDEC_STRIDE+8]
mov [r8+16], al
mov dh, [parm1q+3*FDEC_STRIDE-1]
mov dl, [parm1q+4*FDEC_STRIDE-1]
mov ah, [parm1q-1*FDEC_STRIDE-1]
mov al, [parm1q+0*FDEC_STRIDE-1]
shl edx, 16
shl eax, 16
mov dh, [parm1q+5*FDEC_STRIDE-1]
mov dl, [parm1q+6*FDEC_STRIDE-1]
mov ah, [parm1q+1*FDEC_STRIDE-1]
mov al, [parm1q+2*FDEC_STRIDE-1]
mov [r8+4], eax
mov [r8], edx
movzx eax, byte [parm1q+7*FDEC_STRIDE-1]
movd xmm4, eax
movzx edx, dl
lea eax, [rax+2*rax+2]
add eax, edx
shr eax, 2
movd xmm5, eax
; r8 -> {l6 l5 l4 l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t8}
movdqu xmm0, [r8]
movdqu xmm2, [r8+1]
movdqa xmm1, xmm0
pslldq xmm1, 1
por xmm1, xmm4
PRED8x8_LOWPASS_XMM xmm3, xmm1, xmm2, xmm0, xmm4
movdqa xmm1, xmm3
movdqu xmm3, [parm2q+8]
movdqu xmm1, [parm2q+7]
movdqa xmm2, xmm3
pslldq xmm1, 1
psrldq xmm2, 1
por xmm1, xmm5
PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
movdqa xmm1, xmm0
@@ -452,8 +302,8 @@ predict_8x8_ddr_sse2:
%assign Y 7
%rep 3
movq [parm1q+Y*FDEC_STRIDE], xmm0
psrldq xmm0, 2
movq [parm1q+(Y-1)*FDEC_STRIDE], xmm1
psrldq xmm0, 2
psrldq xmm1, 2
%assign Y (Y-2)
%endrep
@@ -463,15 +313,12 @@ predict_8x8_ddr_sse2:
ret
;-----------------------------------------------------------------------------
;
; void predict_8x8_vl_sse2( uint8_t *src, int i_neighbors )
;
; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_vl_sse2:
PRED8x8_LOAD_TOP_TOPRIGHT_XMM
PRED8x8_LOWPASS_XMM xmm4, xmm1, xmm2, xmm0, xmm5
movdqa xmm4, [parm2q+16]
movdqa xmm2, xmm4
movdqa xmm1, xmm4
movdqa xmm3, xmm4
@@ -482,7 +329,7 @@ predict_8x8_vl_sse2:
; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
; xmm3: (t0 + t1 + 1) >> 1
%assign Y 1
%assign Y 0
%rep 3
psrldq xmm0, 1
movq [parm1q+ Y *FDEC_STRIDE], xmm3
@@ -497,84 +344,59 @@ predict_8x8_vl_sse2:
ret
;-----------------------------------------------------------------------------
;
; void predict_8x8_vr_core_mmxext( uint8_t *src, int i_neighbors, uint16_t ltt0 )
;
; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
; fills only some pixels:
; f0123456789abcdef
; 0 .......
; 1 ,,,,,,
; 2 ......
; 3 ,,,,,
; 4 .....
; 5 ,,,,
; 6 ....
; 7 ,,,
; f01234567
; 0........
; 1,,,,,,,,
; 2 .......
; 3 ,,,,,,,
; 4 ......
; 5 ,,,,,,
; 6 .....
; 7 ,,,,,
ALIGN 16
predict_8x8_vr_core_mmxext:
sub parm1q, FDEC_STRIDE
movq mm1, [parm1q-1]
movq mm2, [parm1q+1]
and parm2d, byte 4
jne .have_topright
mov al, [parm1q+7]
mov ah, al
pinsrw mm2, eax, 3
.have_topright:
PRED8x8_LOWPASS mm4, mm1, mm2, [parm1q], mm7
movq mm1, mm4
movq mm2, mm4
psllq mm1, 8
movq mm3, mm4
pinsrw mm1, parm3d, 0
psrlq mm2, 8
pavgb mm3, mm1
PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm5
movq mm2, [parm2q+16]
movq mm3, [parm2q+15]
movq mm1, [parm2q+14]
movq mm4, mm3
pavgb mm3, mm2
PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
%assign Y 1
%assign Y 0
%rep 3
psllq mm0, 8
movq [parm1q+ Y *FDEC_STRIDE], mm3
movq [parm1q+(Y+1)*FDEC_STRIDE], mm0
psllq mm3, 8
psllq mm0, 8
%assign Y (Y+2)
%endrep
psllq mm0, 8
movq [parm1q+ Y *FDEC_STRIDE], mm3
movq [parm1q+(Y+1)*FDEC_STRIDE], mm0
ret
;-----------------------------------------------------------------------------
;
; void predict_8x8c_v_mmx( uint8_t *src )
;
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8c_v_mmx :
sub parm1q, FDEC_STRIDE
movq mm0,