Commit ef4a8d2e authored by Daniel Kang, committed by Fiona Glaser

Tons of high bit depth intra predict asm

Patch from Google Code-In.
parent c801fc6c
@@ -452,21 +452,21 @@ INIT_XMM
; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 )
;-----------------------------------------------------------------------------
%macro ADD_DC 2
mova m0, [%1+SIZEOF_PIXEL*FDEC_STRIDE*0] ; 8pixels
mova m1, [%1+SIZEOF_PIXEL*FDEC_STRIDE*1]
mova m2, [%1+SIZEOF_PIXEL*FDEC_STRIDE*2]
mova m0, [%1+FDEC_STRIDEB*0] ; 8pixels
mova m1, [%1+FDEC_STRIDEB*1]
mova m2, [%1+FDEC_STRIDEB*2]
paddsw m0, %2
paddsw m1, %2
paddsw m2, %2
paddsw %2, [%1+SIZEOF_PIXEL*FDEC_STRIDE*3]
paddsw %2, [%1+FDEC_STRIDEB*3]
CLIPW m0, m5, m6
CLIPW m1, m5, m6
CLIPW m2, m5, m6
CLIPW %2, m5, m6
mova [%1+SIZEOF_PIXEL*FDEC_STRIDE*0], m0
mova [%1+SIZEOF_PIXEL*FDEC_STRIDE*1], m1
mova [%1+SIZEOF_PIXEL*FDEC_STRIDE*2], m2
mova [%1+SIZEOF_PIXEL*FDEC_STRIDE*3], %2
mova [%1+FDEC_STRIDEB*0], m0
mova [%1+FDEC_STRIDEB*1], m1
mova [%1+FDEC_STRIDEB*2], m2
mova [%1+FDEC_STRIDEB*3], %2
%endmacro
INIT_XMM
@@ -480,8 +480,8 @@ cglobal add8x8_idct_dc_sse2, 2,2,7
pshufhw m3, m3, 10100000b ; _ _ _ _ dc2 dc2 dc3 dc3
pshufd m4, m4, 01010000b ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
pshufd m3, m3, 11111010b ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
ADD_DC r0+SIZEOF_PIXEL*FDEC_STRIDE*0, m4
ADD_DC r0+SIZEOF_PIXEL*FDEC_STRIDE*4, m3
ADD_DC r0+FDEC_STRIDEB*0, m4
ADD_DC r0+FDEC_STRIDEB*4, m3
RET
cglobal add16x16_idct_dc_sse2, 2,3,8
@@ -497,10 +497,10 @@ cglobal add16x16_idct_dc_sse2, 2,3,8
pshufhw m3, m3, 10100000b ; _ _ _ _ dc2 dc2 dc3 dc3
pshufd m4, m4, 01010000b ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
pshufd m3, m3, 11111010b ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
ADD_DC r0+SIZEOF_PIXEL*FDEC_STRIDE*0, m4
ADD_DC r0+FDEC_STRIDEB*0, m4
ADD_DC r0+SIZEOF_PIXEL*8, m3
add r1, 16
add r0, 4*FDEC_STRIDE*SIZEOF_PIXEL
add r0, 4*FDEC_STRIDEB
dec r2
jg .loop
REP_RET
......
@@ -38,8 +38,10 @@ pb_0s_ff: times 7 db 0
SECTION .text
cextern pb_0
cextern pb_1
cextern pb_3
cextern pw_1
cextern pw_2
cextern pw_4
cextern pw_8
@@ -47,70 +49,103 @@ cextern pw_ff00
cextern pb_reverse
%macro STORE8x8 2
add r0, 4*FDEC_STRIDE
movq [r0 + -4*FDEC_STRIDE], %1
movq [r0 + -3*FDEC_STRIDE], %1
movq [r0 + -2*FDEC_STRIDE], %1
movq [r0 + -1*FDEC_STRIDE], %1
movq [r0 + 0*FDEC_STRIDE], %2
movq [r0 + 1*FDEC_STRIDE], %2
movq [r0 + 2*FDEC_STRIDE], %2
movq [r0 + 3*FDEC_STRIDE], %2
add r0, 4*FDEC_STRIDEB
mova [r0 + -4*FDEC_STRIDEB], %1
mova [r0 + -3*FDEC_STRIDEB], %1
mova [r0 + -2*FDEC_STRIDEB], %1
mova [r0 + -1*FDEC_STRIDEB], %1
mova [r0 + 0*FDEC_STRIDEB], %2
mova [r0 + 1*FDEC_STRIDEB], %2
mova [r0 + 2*FDEC_STRIDEB], %2
mova [r0 + 3*FDEC_STRIDEB], %2
%endmacro
%macro STORE16x16 2
%macro STORE16x16 2-4
%ifidn %0, 4
mov r1d, 8
.loop:
mova [r0 + 0*FDEC_STRIDEB + 0], %1
mova [r0 + 1*FDEC_STRIDEB + 0], %1
mova [r0 + 0*FDEC_STRIDEB + 8], %2
mova [r0 + 1*FDEC_STRIDEB + 8], %2
mova [r0 + 0*FDEC_STRIDEB +16], %3
mova [r0 + 1*FDEC_STRIDEB +16], %3
mova [r0 + 0*FDEC_STRIDEB +24], %4
mova [r0 + 1*FDEC_STRIDEB +24], %4
add r0, 2*FDEC_STRIDEB
dec r1d
jg .loop
%else
mov r1d, 4
.loop:
movq [r0 + 0*FDEC_STRIDE], %1
movq [r0 + 1*FDEC_STRIDE], %1
movq [r0 + 2*FDEC_STRIDE], %1
movq [r0 + 3*FDEC_STRIDE], %1
movq [r0 + 0*FDEC_STRIDE + 8], %2
movq [r0 + 1*FDEC_STRIDE + 8], %2
movq [r0 + 2*FDEC_STRIDE + 8], %2
movq [r0 + 3*FDEC_STRIDE + 8], %2
mova [r0 + 0*FDEC_STRIDE], %1
mova [r0 + 1*FDEC_STRIDE], %1
mova [r0 + 2*FDEC_STRIDE], %1
mova [r0 + 3*FDEC_STRIDE], %1
mova [r0 + 0*FDEC_STRIDE + 8], %2
mova [r0 + 1*FDEC_STRIDE + 8], %2
mova [r0 + 2*FDEC_STRIDE + 8], %2
mova [r0 + 3*FDEC_STRIDE + 8], %2
add r0, 4*FDEC_STRIDE
dec r1d
jg .loop
%endif
%endmacro
%macro STORE16x16_SSE2 1
add r0, 4*FDEC_STRIDE
movdqa [r0 + -4*FDEC_STRIDE], %1
movdqa [r0 + -3*FDEC_STRIDE], %1
movdqa [r0 + -2*FDEC_STRIDE], %1
movdqa [r0 + -1*FDEC_STRIDE], %1
movdqa [r0 + 0*FDEC_STRIDE], %1
movdqa [r0 + 1*FDEC_STRIDE], %1
movdqa [r0 + 2*FDEC_STRIDE], %1
movdqa [r0 + 3*FDEC_STRIDE], %1
add r0, 8*FDEC_STRIDE
movdqa [r0 + -4*FDEC_STRIDE], %1
movdqa [r0 + -3*FDEC_STRIDE], %1
movdqa [r0 + -2*FDEC_STRIDE], %1
movdqa [r0 + -1*FDEC_STRIDE], %1
movdqa [r0 + 0*FDEC_STRIDE], %1
movdqa [r0 + 1*FDEC_STRIDE], %1
movdqa [r0 + 2*FDEC_STRIDE], %1
movdqa [r0 + 3*FDEC_STRIDE], %1
; Fill a full 16x16 luma block at r0 with constant register contents.
; 1-arg form (8-bit pixels): %1 is one 16-byte row, stored to all 16 rows.
; 2-arg form (high bit depth, selected by passing a 2nd arg): each 32-byte
; row is %1 (bytes 0-15) followed by %2 (bytes 16-31).
; Clobbers: r0 (advanced past the block); the 2-arg form also uses r1d as
; a loop counter.
%macro STORE16x16_SSE2 1-2
%ifidn %0,2
mov r1d, 4 ; 4 iterations x 4 rows each = 16 rows
.loop
mova [r0+0*FDEC_STRIDEB+ 0], %1
mova [r0+0*FDEC_STRIDEB+16], %2
mova [r0+1*FDEC_STRIDEB+ 0], %1
mova [r0+1*FDEC_STRIDEB+16], %2
mova [r0+2*FDEC_STRIDEB+ 0], %1
mova [r0+2*FDEC_STRIDEB+16], %2
mova [r0+3*FDEC_STRIDEB+ 0], %1
mova [r0+3*FDEC_STRIDEB+16], %2
add r0, 4*FDEC_STRIDEB
dec r1d
jg .loop
%else
; 1-arg form: fully unrolled, using signed offsets around r0 so each
; displacement fits compactly; r0 ends up 12 rows past the block start.
add r0, 4*FDEC_STRIDEB
mova [r0 + -4*FDEC_STRIDEB], %1
mova [r0 + -3*FDEC_STRIDEB], %1
mova [r0 + -2*FDEC_STRIDEB], %1
mova [r0 + -1*FDEC_STRIDEB], %1
mova [r0 + 0*FDEC_STRIDEB], %1
mova [r0 + 1*FDEC_STRIDEB], %1
mova [r0 + 2*FDEC_STRIDEB], %1
mova [r0 + 3*FDEC_STRIDEB], %1
add r0, 8*FDEC_STRIDEB
mova [r0 + -4*FDEC_STRIDEB], %1
mova [r0 + -3*FDEC_STRIDEB], %1
mova [r0 + -2*FDEC_STRIDEB], %1
mova [r0 + -1*FDEC_STRIDEB], %1
mova [r0 + 0*FDEC_STRIDEB], %1
mova [r0 + 1*FDEC_STRIDEB], %1
mova [r0 + 2*FDEC_STRIDEB], %1
mova [r0 + 3*FDEC_STRIDEB], %1
%endif
%endmacro
; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED8x8_LOWPASS0 6
mov%6 %5, %2
%macro PRED8x8_LOWPASS 5-6
%ifidn %1, w
mova %2, %5
paddw %3, %4
psrlw %3, 1
pavgw %2, %3
%else
mova %6, %3
pavgb %3, %4
pxor %4, %6
mova %2, %5
pand %4, [pb_1]
psubusb %3, %4
pavgb %2, %3
pxor %3, %5
mov%6 %1, %4
pand %3, [pb_1]
psubusb %2, %3
pavgb %1, %2
%endmacro
%macro PRED8x8_LOWPASS 5
PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, q
%endmacro
%macro PRED8x8_LOWPASS_XMM 5
PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa
%endif
%endmacro
%macro LOAD_PLANE_ARGS 0
@@ -129,115 +164,186 @@ cextern pb_reverse
%endmacro
;-----------------------------------------------------------------------------
; void predict_4x4_ddl( uint8_t *src )
; void predict_4x4_ddl( pixel *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_ddl_mmxext, 1,1
movq mm1, [r0-FDEC_STRIDE]
movq mm2, mm1
movq mm3, mm1
movq mm4, mm1
psllq mm1, 8
pxor mm2, mm1
psrlq mm2, 8
pxor mm3, mm2
PRED8x8_LOWPASS mm0, mm1, mm3, mm4, mm5
%macro PREDICT_4x4_DDL 4
cglobal predict_4x4_ddl_%1, 1,1
mova m1, [r0-FDEC_STRIDEB]
mova m2, m1
mova m3, m1
mova m4, m1
psll%2 m1, %3
pxor m2, m1
psrl%2 m2, %3
pxor m3, m2
PRED8x8_LOWPASS %4, m0, m1, m3, m4, m5
%assign Y 0
%rep 4
psrlq mm0, 8
movd [r0+Y*FDEC_STRIDE], mm0
psrl%2 m0, %3
movh [r0+Y*FDEC_STRIDEB], m0
%assign Y (Y+1)
%endrep
RET
%endmacro
%ifdef HIGH_BIT_DEPTH
INIT_XMM
PREDICT_4x4_DDL sse2, dq, 2, w
INIT_MMX
%define PALIGNR PALIGNR_MMX
; void predict_4x4_ddl( pixel *src ) -- high-bit-depth (16-bit pixel) MMX path.
; Offsets here are in bytes: with 2-byte pixels, 2*FDEC_STRIDE == FDEC_STRIDEB,
; so [r0-2*FDEC_STRIDE] is the row of top neighbours t0..t7 and output rows
; 0..3 land at 0/2/4/6*FDEC_STRIDE.
cglobal predict_4x4_ddl_mmxext, 1,2
mova m1, [r0-2*FDEC_STRIDE+4] ; t2 t3 t4 t5
mova m2, [r0-2*FDEC_STRIDE+0] ; t0 t1 t2 t3
mova m3, [r0-2*FDEC_STRIDE+2] ; t1 t2 t3 t4
PRED8x8_LOWPASS w, m0, m1, m2, m3 ; row 0: (t[n-1]+2*t[n]+t[n+1]+2)>>2
mova [r0+0*FDEC_STRIDE], m0
mova m5, [r0-2*FDEC_STRIDE+6] ; t3 t4 t5 t6
mova m6, [r0-2*FDEC_STRIDE+8] ; t4 t5 t6 t7
pshufw m7, m6, 0xF9 ; t5 t6 t7 t7 (last top pixel replicated)
PRED8x8_LOWPASS w, m4, m7, m5, m6 ; row 3: lowpass centred on t4..t7
mova [r0+6*FDEC_STRIDE], m4
psllq m0, 16
PALIGNR m4, m0, 6, m1 ; row 2: row 3 shifted back one pixel, refilled from row 0
mova [r0+4*FDEC_STRIDE], m4
psllq m0, 16
PALIGNR m4, m0, 6, m0 ; row 1: shift back one more pixel
mova [r0+2*FDEC_STRIDE], m4
RET
%else
INIT_MMX
PREDICT_4x4_DDL mmxext, q , 8, b
%endif
;-----------------------------------------------------------------------------
; void predict_4x4_ddr( uint8_t *src )
; void predict_4x4_ddr( pixel *src )
;-----------------------------------------------------------------------------
%macro PREDICT_4x4 1
%macro PREDICT_4x4 7
cglobal predict_4x4_ddr_%1, 1,1
movq mm1, [r0+1*FDEC_STRIDE-8]
movq mm2, [r0+0*FDEC_STRIDE-8]
punpckhbw mm2, [r0-1*FDEC_STRIDE-8]
movd mm3, [r0-1*FDEC_STRIDE]
punpckhwd mm1, mm2
PALIGNR mm3, mm1, 5, mm1
movq mm1, mm3
PALIGNR mm3, [r0+2*FDEC_STRIDE-8], 7, mm4
movq mm2, mm3
PALIGNR mm3, [r0+3*FDEC_STRIDE-8], 7, mm4
PRED8x8_LOWPASS mm0, mm3, mm1, mm2, mm4
mova m1, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
mova m2, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
punpckh%2 m2, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
movh m3, [r0-1*FDEC_STRIDEB]
punpckh%3 m1, m2
PALIGNR m3, m1, 5*SIZEOF_PIXEL, m1
mova m1, m3
PALIGNR m3, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4
mova m2, m3
PALIGNR m3, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4
PRED8x8_LOWPASS %5, m0, m3, m1, m2, m4
%assign Y 3
movd [r0+Y*FDEC_STRIDE], mm0
movh [r0+Y*FDEC_STRIDEB], m0
%rep 3
%assign Y (Y-1)
psrlq mm0, 8
movd [r0+Y*FDEC_STRIDE], mm0
psrl%4 m0, %7
movh [r0+Y*FDEC_STRIDEB], m0
%endrep
RET
cglobal predict_4x4_vr_%1, 1,1
movd mm0, [r0-1*FDEC_STRIDE] ; ........t3t2t1t0
movq mm7, mm0
PALIGNR mm0, [r0-1*FDEC_STRIDE-8], 7, mm1 ; ......t3t2t1t0lt
pavgb mm7, mm0
PALIGNR mm0, [r0+0*FDEC_STRIDE-8], 7, mm1 ; ....t3t2t1t0ltl0
movq mm1, mm0
PALIGNR mm0, [r0+1*FDEC_STRIDE-8], 7, mm2 ; ..t3t2t1t0ltl0l1
movq mm2, mm0
PALIGNR mm0, [r0+2*FDEC_STRIDE-8], 7, mm3 ; t3t2t1t0ltl0l1l2
PRED8x8_LOWPASS mm3, mm1, mm0, mm2, mm4
movq mm1, mm3
psrlq mm3, 16
psllq mm1, 48
movd [r0+0*FDEC_STRIDE], mm7
movd [r0+1*FDEC_STRIDE], mm3
PALIGNR mm7, mm1, 7, mm2
psllq mm1, 8
movd [r0+2*FDEC_STRIDE], mm7
PALIGNR mm3, mm1, 7, mm1
movd [r0+3*FDEC_STRIDE], mm3
cglobal predict_4x4_vr_%1, 1,1,6*(mmsize/16)
movh m0, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0
mova m5, m0
PALIGNR m0, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ......t3t2t1t0lt
pavg%5 m5, m0
PALIGNR m0, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ....t3t2t1t0ltl0
mova m1, m0
PALIGNR m0, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m2 ; ..t3t2t1t0ltl0l1
mova m2, m0
PALIGNR m0, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2
PRED8x8_LOWPASS %5, m3, m1, m0, m2, m4
mova m1, m3
psrl%4 m3, %7*2
psll%4 m1, %7*6
movh [r0+0*FDEC_STRIDEB], m5
movh [r0+1*FDEC_STRIDEB], m3
PALIGNR m5, m1, 7*SIZEOF_PIXEL, m2
psll%4 m1, %7
movh [r0+2*FDEC_STRIDEB], m5
PALIGNR m3, m1, 7*SIZEOF_PIXEL, m1
movh [r0+3*FDEC_STRIDEB], m3
RET
cglobal predict_4x4_hd_%1, 1,1
movd mm0, [r0-1*FDEC_STRIDE-4] ; lt ..
punpckldq mm0, [r0-1*FDEC_STRIDE] ; t3 t2 t1 t0 lt .. .. ..
psllq mm0, 8 ; t2 t1 t0 lt .. .. .. ..
movq mm1, [r0+3*FDEC_STRIDE-8] ; l3
punpckhbw mm1, [r0+2*FDEC_STRIDE-8] ; l2 l3
movq mm2, [r0+1*FDEC_STRIDE-8] ; l1
punpckhbw mm2, [r0+0*FDEC_STRIDE-8] ; l0 l1
punpckhwd mm1, mm2 ; l0 l1 l2 l3
punpckhdq mm1, mm0 ; t2 t1 t0 lt l0 l1 l2 l3
movq mm0, mm1
movq mm2, mm1
movq mm7, mm1
psrlq mm0, 16 ; .. .. t2 t1 t0 lt l0 l1
psrlq mm2, 8 ; .. t2 t1 t0 lt l0 l1 l2
pavgb mm7, mm2
PRED8x8_LOWPASS mm3, mm1, mm0, mm2, mm4
punpcklbw mm7, mm3
psrlq mm3, 32
PALIGNR mm3, mm7, 6, mm6
cglobal predict_4x4_hd_%1, 1,1,6*(mmsize/16)
movh m0, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; lt ..
punpckl%6 m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. ..
psll%4 m0, %7 ; t2 t1 t0 lt .. .. .. ..
mova m1, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l3
punpckh%2 m1, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l2 l3
mova m2, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l1
punpckh%2 m2, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l0 l1
punpckh%3 m1, m2 ; l0 l1 l2 l3
punpckh%6 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
mova m0, m1
mova m2, m1
mova m5, m1
psrl%4 m0, %7*2 ; .. .. t2 t1 t0 lt l0 l1
psrl%4 m2, %7 ; .. t2 t1 t0 lt l0 l1 l2
pavg%5 m5, m2
PRED8x8_LOWPASS %5, m3, m1, m0, m2, m4
punpckl%2 m5, m3
psrl%4 m3, %7*4
PALIGNR m3, m5, 6*SIZEOF_PIXEL, m4
%assign Y 3
movd [r0+Y*FDEC_STRIDE], mm7
movh [r0+Y*FDEC_STRIDEB], m5
%rep 2
%assign Y (Y-1)
psrlq mm7, 16
movd [r0+Y*FDEC_STRIDE], mm7
psrl%4 m5, %7*2
movh [r0+Y*FDEC_STRIDEB], m5
%endrep
movd [r0+0*FDEC_STRIDE], mm3
movh [r0+0*FDEC_STRIDEB], m3
RET
%endmacro
%ifdef HIGH_BIT_DEPTH
INIT_XMM
%define PALIGNR PALIGNR_SSSE3
PREDICT_4x4 ssse3 , wd, dq, dq, w, qdq, 2
%else
INIT_MMX
%define PALIGNR PALIGNR_MMX
PREDICT_4x4 mmxext
PREDICT_4x4 mmxext, bw, wd, q , b, dq , 8
%define PALIGNR PALIGNR_SSSE3
PREDICT_4x4 ssse3
PREDICT_4x4 ssse3 , bw, wd, q , b, dq , 8
%endif
;-----------------------------------------------------------------------------
; void predict_4x4_hu( uint8_t *src )
; void predict_4x4_hu( pixel *src )
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_XMM
; void predict_4x4_hu( pixel *src ) -- high bit depth.
; Gathers the four left neighbours l0..l3 (loaded with MMX regs, then moved
; to XMM), pads with l3, and builds the horizontal-up prediction sequence of
; interleaved averages and 3-tap lowpass values; consecutive rows overlap by
; two pixels.
cglobal predict_4x4_hu_sse2, 1,1,6
movq mm0, [r0+0*FDEC_STRIDEB-4*2] ; 4 pixels left of row 0 (l0 in top word)
punpckhwd mm0, [r0+1*FDEC_STRIDEB-4*2] ; .. l0 l1 in high dword
movq mm1, [r0+2*FDEC_STRIDEB-4*2]
punpckhwd mm1, [r0+3*FDEC_STRIDEB-4*2] ; .. l2 l3 in high dword
punpckhdq mm0, mm1 ; l0 l1 l2 l3
pshufw mm1, mm1, 0xFF ; l3 l3 l3 l3 (padding + final row)
movq2dq m0, mm0
movq2dq m1, mm1
punpcklqdq m0, m1 ; l0 l1 l2 l3 l3 l3 l3 l3
mova m2, m0
mova m3, m0
mova m1, m0
psrldq m2, 4 ; sequence advanced 2 pixels
psrldq m3, 2 ; sequence advanced 1 pixel
pavgw m1, m3 ; pairwise averages (l[n]+l[n+1]+1)>>1
PRED8x8_LOWPASS w, m4, m0, m2, m3, m5 ; 3-tap lowpass terms
punpcklwd m1, m4 ; interleave avg/lowpass -> prediction sequence
movq [r0+0*FDEC_STRIDEB], m1
psrldq m1, 4 ; each row starts 2 pixels further along
movq [r0+1*FDEC_STRIDEB], m1
psrldq m1, 4
movq [r0+2*FDEC_STRIDEB], m1
movq [r0+3*FDEC_STRIDEB], mm1 ; row 3 is all l3 -- still held in mm1 from the pshufw above
RET
%else
INIT_MMX
cglobal predict_4x4_hu_mmxext, 1,1
movq mm0, [r0+0*FDEC_STRIDE-8]
punpckhbw mm0, [r0+1*FDEC_STRIDE-8]
@@ -254,7 +360,7 @@ cglobal predict_4x4_hu_mmxext, 1,1
psrlq mm2, 16
psrlq mm3, 8
pavgb mm7, mm3
PRED8x8_LOWPASS mm4, mm0, mm2, mm3, mm5
PRED8x8_LOWPASS b, mm4, mm0, mm2, mm3, mm5
punpcklbw mm7, mm4
%assign Y 0
movd [r0+Y*FDEC_STRIDE], mm7
@@ -265,34 +371,94 @@ cglobal predict_4x4_hu_mmxext, 1,1
%endrep
movd [r0+3*FDEC_STRIDE], mm1
RET
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void predict_4x4_vl( pixel *src )
;-----------------------------------------------------------------------------
; Instantiate predict_4x4_vl for one pixel size.
; %1 = function name suffix (cpu), %2 = shift-width suffix (q for MMX,
; dq for XMM), %3 = shift amount per pixel (bits for psrlq, bytes for
; psrldq), %4 = element suffix (b for 8-bit, w for 16-bit pixels).
%macro PREDICT_4x4_V1 4
cglobal predict_4x4_vl_%1, 1,1,6*(mmsize/16)
mova m1, [r0-FDEC_STRIDEB] ; top neighbours t0..
mova m3, m1
mova m2, m1
psrl%2 m3, %3 ; t1..
psrl%2 m2, %3*2 ; t2..
mova m4, m3
pavg%4 m4, m1 ; even rows: (t[n]+t[n+1]+1)>>1
PRED8x8_LOWPASS %4, m0, m1, m2, m3, m5 ; odd rows: 3-tap lowpass
movh [r0+0*FDEC_STRIDEB], m4
movh [r0+1*FDEC_STRIDEB], m0
psrl%2 m4, %3 ; rows 2/3 = rows 0/1 advanced one pixel
psrl%2 m0, %3
movh [r0+2*FDEC_STRIDEB], m4
movh [r0+3*FDEC_STRIDEB], m0
RET
%endmacro
;-----------------------------------------------------------------------------
; void predict_4x4_vl( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_vl_mmxext, 1,1
movq mm1, [r0-FDEC_STRIDE]
movq mm3, mm1
movq mm2, mm1
psrlq mm3, 8
psrlq mm2, 16
movq mm4, mm3
pavgb mm4, mm1
PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5
movd [r0+0*FDEC_STRIDE], mm4
movd [r0+1*FDEC_STRIDE], mm0
psrlq mm4, 8
psrlq mm0, 8
movd [r0+2*FDEC_STRIDE], mm4
movd [r0+3*FDEC_STRIDE], mm0
%ifdef HIGH_BIT_DEPTH
INIT_XMM
PREDICT_4x4_V1 sse2, dq, 2, w
INIT_MMX
%define PALIGNR PALIGNR_MMX
; void predict_4x4_vl( pixel *src ) -- high-bit-depth MMX path.
; An MMX register only holds 4 of the needed pixels per row, so the vector
; part computes rows with the rightmost pixel of rows 2/3 left invalid
; (zeros shifted in by psrlq); a scalar tail recomputes those two pixels
; from t4..t6.
cglobal predict_4x4_vl_mmxext, 1,4
mova m1, [r0-FDEC_STRIDEB+0] ; t0 t1 t2 t3
mova m2, [r0-FDEC_STRIDEB+8] ; t4 t5 t6 t7
mova m3, m2
PALIGNR m2, m1, 4, m6 ; t2 t3 t4 t5
PALIGNR m3, m1, 2, m5 ; t1 t2 t3 t4
mova m4, m3
pavgw m4, m1 ; even rows: (t[n]+t[n+1]+1)>>1
mova [r0+0*FDEC_STRIDEB], m4
psrlq m4, 16 ; row 2 = row 0 advanced one pixel (last word invalid)
mova [r0+2*FDEC_STRIDEB], m4
PRED8x8_LOWPASS w, m0, m1, m2, m3, m6 ; odd rows: 3-tap lowpass
mova [r0+1*FDEC_STRIDEB], m0
psrlq m0, 16 ; row 3 = row 1 advanced one pixel (last word invalid)
mova [r0+3*FDEC_STRIDEB], m0
; Scalar fixup of the 4th pixel of rows 2 and 3.
movzx r1d, word [r0-FDEC_STRIDEB+ 8] ; t4
movzx r2d, word [r0-FDEC_STRIDEB+10] ; t5
movzx r3d, word [r0-FDEC_STRIDEB+12] ; t6
lea r1d, [r1+r2+1] ; t4+t5+1
add r3d, r2d ; t5+t6
lea r3d, [r3+r1+1] ; t4+2*t5+t6+2
shr r1d, 1 ; avg -> row 2 pixel
shr r3d, 2 ; lowpass -> row 3 pixel
mov [r0+2*FDEC_STRIDEB+6], r1w
mov [r0+3*FDEC_STRIDEB+6], r3w
RET
%else
INIT_MMX
PREDICT_4x4_V1 mmxext, q , 8, b
%endif
;-----------------------------------------------------------------------------
; void predict_4x4_dc( uint8_t *src )
; void predict_4x4_dc( pixel *src )
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_MMX
; void predict_4x4_dc( pixel *src ) -- high bit depth.
; dc = (l0+l1+l2+l3 + t0+t1+t2+t3 + 4) >> 3, broadcast to all 16 pixels.
cglobal predict_4x4_dc_mmxext, 1,1
mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; 4 words ending at l0
paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
paddw m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
paddw m2, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL]
psrlq m2, 48 ; keep top word = l0+l1+l2+l3
mova m0, [r0-FDEC_STRIDEB] ; t0 t1 t2 t3
HADDW m0, m1 ; horizontal sum of the top row
paddw m0, [pw_4] ; rounding term
paddw m0, m2
psrlw m0, 3
SPLATW m0, m0 ; broadcast dc to all 4 words
mova [r0+0*FDEC_STRIDEB], m0
mova [r0+1*FDEC_STRIDEB], m0
mova [r0+2*FDEC_STRIDEB], m0
mova [r0+3*FDEC_STRIDEB], m0
RET
%else
INIT_MMX
cglobal predict_4x4_dc_mmxext, 1,4
pxor mm7, mm7
movd mm0, [r0-FDEC_STRIDE]
@@ -313,12 +479,13 @@ cglobal predict_4x4_dc_mmxext, 1,4
mov [r0+FDEC_STRIDE*2], r1d
mov [r0+FDEC_STRIDE*3], r1d
RET
%endif ; HIGH_BIT_DEPTH
%macro PREDICT_FILTER 1
;-----------------------------------------------------------------------------
;void predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal predict_8x8_filter_%1, 4,5
add r0, 0x58
%define src r0-0x58
@@ -353,10 +520,10 @@ cglobal predict_8x8_filter_%1, 4,5
je .fix_lt_1
.do_left:
movq mm0, mm4
PRED8x8_LOWPASS mm2, mm1, mm4, mm3, mm5
PRED8x8_LOWPASS b, mm2, mm1, mm4, mm3, mm5
movq [t1+8], mm2
movq mm4, mm0
PRED8x8_LOWPASS mm1, mm3, mm0, mm4, mm5
PRED8x8_LOWPASS b, mm1, mm3, mm0, mm4, mm5
movd t4, mm1
mov [t1+7], t4b
.check_top:
@@ -374,7 +541,7 @@ cglobal predict_8x8_filter_%1, 4,5
test r2b, 0x04
je .fix_tr_1
.do_top:
PRED8x8_LOWPASS mm4, mm2, mm1, mm3, mm5
PRED8x8_LOWPASS b, mm4, mm2, mm1, mm3, mm5
movq [t1+16], mm4
test r3b, 0x04
je .done
@@ -387,7 +554,7 @@ cglobal predict_8x8_filter_%1, 4,5
psrlq mm5, 56
PALIGNR mm2, mm3, 7, mm3
PALIGNR mm5, mm4, 1, mm4
PRED8x8_LOWPASS mm1, mm2, mm5, mm0, mm4
PRED8x8_LOWPASS b, mm1, mm2, mm5, mm0, mm4
jmp .do_topright
.fix_tr_2:
punpckhbw mm3, mm3
@@ -424,47 +591,74 @@ cglobal predict_8x8_filter_%1, 4,5
%endmacro
%define PALIGNR PALIGNR_MMX
INIT_MMX
PREDICT_FILTER mmxext
%define PALIGNR PALIGNR_SSSE3
PREDICT_FILTER ssse3
;-----------------------------------------------------------------------------
; void predict_8x8_v( uint8_t *src, uint8_t *edge )
; void predict_8x8_v( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_v_mmxext, 2,2
movq mm0, [r1+16]
STORE8x8 mm0, mm0
; void predict_8x8_v( pixel *src, pixel *edge )
; Vertical prediction: replicate the top-neighbour row (at pixel offset 16
; in the edge buffer) into all 8 rows via STORE8x8.
; %1 = function name suffix (cpu).
%macro PREDICT_8x8_V 1
cglobal predict_8x8_v_%1, 2,2
mova m0, [r1+16*SIZEOF_PIXEL] ; top edge pixels
STORE8x8 m0, m0
RET
%endmacro
%ifdef HIGH_BIT_DEPTH
INIT_XMM
PREDICT_8x8_V sse2
%else
INIT_MMX
PREDICT_8x8_V mmxext
%endif
;-----------------------------------------------------------------------------
; void predict_8x8_h( uint8_t *src, uint8_t edge[33] )
; void predict_8x8_h( pixel *src, pixel edge[33] )
;-----------------------------------------------------------------------------