Commit 0743869d authored by Fiona Glaser

More intra pred asm optimizations

SSSE3 version of predict_8x8_hu.
SSE2 version of predict_8x8c_p.
SSSE3 versions of both planar prediction functions.
Optimizations to predict_16x16_p_sse2.
Some unnecessary REP_RETs -> RETs.
SSE2 version of predict_8x8_vr by Holger.
SSE2 version of predict_8x8_hd.
Don't compile MMX versions of some of the pred functions on x86_64.
Remove now-useless x86_64 C versions of 4x4 pred functions.
Rewrite some of the x86_64-only C functions in asm.
parent 3c5cb4f1
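
For orientation: all four *_p_core routines touched by this diff evaluate the same planar predictor. A minimal scalar sketch of what the 8x8 chroma core computes, derived from the asm below (the clip_uint8 helper and the _ref name are illustrative, not x264's; FDEC_STRIDE is 32 in x264; the asm's paddsw/packuswb saturation is ignored here as it never triggers for valid inputs):

#include <stdint.h>

#define FDEC_STRIDE 32  /* x264's decoded-macroblock stride */

static inline uint8_t clip_uint8( int x )
{
    return x < 0 ? 0 : x > 255 ? 255 : x;
}

/* i00 is the precomputed plane origin (scaled by 32), b and c the
   horizontal and vertical gradients; the 16x16 core does the same
   thing over a 16x16 block. */
static void predict_8x8c_p_core_ref( uint8_t *src, int i00, int b, int c )
{
    for( int y = 0; y < 8; y++, src += FDEC_STRIDE )
        for( int x = 0; x < 8; x++ )
            src[x] = clip_uint8( (i00 + b*x + c*y) >> 5 );
}

The asm keeps i00 + b*x for a row in packed words, adds c per row, then shifts right by 5 and packs with unsigned saturation.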
@@ -4,6 +4,7 @@
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <holger@lubitz.org>
;* Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
@@ -86,6 +87,8 @@ pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
pb_00s_ff: times 8 db 0
pb_0s_ff: times 7 db 0
db 0xff
pw_ff00: times 8 dw 0xff00
pb_reverse: db 7, 6, 5, 4, 3, 2, 1, 0
SECTION .text
@@ -107,6 +110,20 @@ SECTION .text
PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa
%endmacro
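; moved up here (from the 8x8c section below) so the 8x8c and 16x16
; planar cores can share it: on x86_64 the i00/b/c arguments arrive in
; GPRs and need a movd before pshufw can broadcast them, while on x86_32
; pshufw broadcasts straight from the stack argument slots (r1m/r2m/r3m)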
%macro LOAD_PLANE_ARGS 0
%ifdef ARCH_X86_64
movd mm0, r1d
movd mm2, r2d
movd mm4, r3d
pshufw mm0, mm0, 0
pshufw mm2, mm2, 0
pshufw mm4, mm4, 0
%else
pshufw mm0, r1m, 0
pshufw mm2, r2m, 0
pshufw mm4, r3m, 0
%endif
%endmacro
;-----------------------------------------------------------------------------
; void predict_4x4_ddl_mmxext( uint8_t *src )
@@ -141,7 +158,7 @@ cglobal predict_4x4_ddr_%1, 1,1
punpckhbw mm2, [r0-1*FDEC_STRIDE-8]
movd mm3, [r0-1*FDEC_STRIDE]
punpckhwd mm1, mm2
PALIGNR mm3, mm1, 5, mm4
PALIGNR mm3, mm1, 5, mm1
movq mm1, mm3
PALIGNR mm3, [r0+2*FDEC_STRIDE-8], 7, mm4
movq mm2, mm3
@@ -175,7 +192,7 @@ cglobal predict_4x4_vr_%1, 1,1
PALIGNR mm7, mm1, 7, mm2
psllq mm1, 8
movd [r0+2*FDEC_STRIDE], mm7
PALIGNR mm3, mm1, 7, mm2
PALIGNR mm3, mm1, 7, mm1
movd [r0+3*FDEC_STRIDE], mm3
RET
@@ -539,6 +556,156 @@ cglobal predict_8x8_ddr_mmxext, 2,2
movq [r0+Y*FDEC_STRIDE], mm0
RET
;-----------------------------------------------------------------------------
; void predict_8x8_hu_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_hu_mmxext, 2,2
movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
add r0, 4*FDEC_STRIDE
pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
movq mm2, mm0
psllw mm0, 8
psrlw mm2, 8
por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
movq mm3, mm2
movq mm4, mm2
movq mm5, mm2
psrlq mm2, 8
psrlq mm3, 16
por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1
punpckhbw mm1, mm1
por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2
pavgb mm4, mm2
PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
movq mm5, mm4
punpcklbw mm4, mm1 ; p4 p3 p2 p1
punpckhbw mm5, mm1 ; p8 p7 p6 p5
movq mm6, mm5
movq mm7, mm5
movq mm0, mm5
PALIGNR mm5, mm4, 2, mm1
pshufw mm1, mm6, 11111001b
PALIGNR mm6, mm4, 4, mm2
pshufw mm2, mm7, 11111110b
PALIGNR mm7, mm4, 6, mm3
pshufw mm3, mm0, 11111111b
movq [r0-4*FDEC_STRIDE], mm4
movq [r0-3*FDEC_STRIDE], mm5
movq [r0-2*FDEC_STRIDE], mm6
movq [r0-1*FDEC_STRIDE], mm7
movq [r0+0*FDEC_STRIDE], mm0
movq [r0+1*FDEC_STRIDE], mm1
movq [r0+2*FDEC_STRIDE], mm2
movq [r0+3*FDEC_STRIDE], mm3
RET
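
For reference, a scalar sketch of the horizontal-up rule this routine vectorizes, in the zHU form used by the H.264 spec (the helper is illustrative, not x264's C fallback). left[y] is the reconstructed pixel to the left of row y, which the asm loads from the packed edge[] buffer:

#include <stdint.h>

static void predict_8x8_hu_ref( uint8_t dst[8][8], const uint8_t left[8] )
{
    for( int y = 0; y < 8; y++ )
        for( int x = 0; x < 8; x++ )
        {
            int z = x + 2*y;
            if( z > 13 )      /* past the last left sample: replicate l7 */
                dst[y][x] = left[7];
            else if( z == 13 )
                dst[y][x] = ( left[6] + 3*left[7] + 2 ) >> 2;
            else if( z & 1 )  /* odd diagonals: 3-tap lowpass */
                dst[y][x] = ( left[y+(x>>1)] + 2*left[y+(x>>1)+1]
                            + left[y+(x>>1)+2] + 2 ) >> 2;
            else              /* even diagonals: 2-tap average */
                dst[y][x] = ( left[y+(x>>1)] + left[y+(x>>1)+1] + 1 ) >> 1;
        }
}

Since every output is one of only 14 distinct values, the asm computes the averages (pavgb) and lowpass values once, interleaves them with punpck, and produces each row by shifting (PALIGNR) or replicating (pshufw) that packed vector.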
;-----------------------------------------------------------------------------
; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
; fills only some pixels:
; f01234567
; 0........
; 1,,,,,,,,
; 2 .......
; 3 ,,,,,,,
; 4 ......
; 5 ,,,,,,
; 6 .....
; 7 ,,,,,
cglobal predict_8x8_vr_core_mmxext, 2,2
movq mm2, [r1+16]
movq mm3, [r1+15]
movq mm1, [r1+14]
movq mm4, mm3
pavgb mm3, mm2
PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
%assign Y 0
%rep 3
movq [r0+ Y *FDEC_STRIDE], mm3
movq [r0+(Y+1)*FDEC_STRIDE], mm0
psllq mm3, 8
psllq mm0, 8
%assign Y (Y+2)
%endrep
movq [r0+ Y *FDEC_STRIDE], mm3
movq [r0+(Y+1)*FDEC_STRIDE], mm0
RET
;-----------------------------------------------------------------------------
; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
cglobal predict_8x8c_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
movq mm1, mm2
pmullw mm2, [pw_3210 GLOBAL]
psllw mm1, 2
paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
mov r1d, 8
ALIGN 4
.loop:
movq mm5, mm0
movq mm6, mm1
psraw mm5, 5
psraw mm6, 5
packuswb mm5, mm6
movq [r0], mm5
paddsw mm0, mm4
paddsw mm1, mm4
add r0, FDEC_STRIDE
dec r1d
jg .loop
REP_RET
;-----------------------------------------------------------------------------
; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
cglobal predict_16x16_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
movq mm5, mm2
movq mm1, mm2
pmullw mm5, [pw_3210 GLOBAL]
psllw mm2, 3
psllw mm1, 2
movq mm3, mm2
paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
mov r1d, 16
ALIGN 4
.loop:
movq mm5, mm0
movq mm6, mm1
psraw mm5, 5
psraw mm6, 5
packuswb mm5, mm6
movq [r0], mm5
movq mm5, mm2
movq mm6, mm3
psraw mm5, 5
psraw mm6, 5
packuswb mm5, mm6
movq [r0+8], mm5
paddsw mm0, mm4
paddsw mm1, mm4
paddsw mm2, mm4
paddsw mm3, mm4
add r0, FDEC_STRIDE
dec r1d
jg .loop
REP_RET
%endif ; !ARCH_X86_64
;-----------------------------------------------------------------------------
@@ -614,42 +781,40 @@ cglobal predict_8x8_vl_sse2, 2,2
RET
;-----------------------------------------------------------------------------
; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
; void predict_8x8_vr_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
; fills only some pixels:
; f01234567
; 0........
; 1,,,,,,,,
; 2 .......
; 3 ,,,,,,,
; 4 ......
; 5 ,,,,,,
; 6 .....
; 7 ,,,,,
cglobal predict_8x8_vr_core_mmxext, 2,2
movq mm2, [r1+16]
movq mm3, [r1+15]
movq mm1, [r1+14]
movq mm4, mm3
pavgb mm3, mm2
PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
%assign Y 0
cglobal predict_8x8_vr_sse2, 2,2
movdqu xmm0, [r1+8]
movdqa xmm6, [pw_ff00 GLOBAL]
add r0, 4*FDEC_STRIDE
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
pslldq xmm0, 1
pslldq xmm1, 2
pavgb xmm2, xmm0
PRED8x8_LOWPASS_XMM xmm4, xmm3, xmm1, xmm0, xmm5
pandn xmm6, xmm4
movdqa xmm5, xmm4
psrlw xmm4, 8
packuswb xmm6, xmm4
movhlps xmm4, xmm6
movhps [r0-3*FDEC_STRIDE], xmm5
movhps [r0-4*FDEC_STRIDE], xmm2
psrldq xmm5, 4
movss xmm5, xmm6
psrldq xmm2, 4
movss xmm2, xmm4
%assign Y 3
%rep 3
movq [r0+ Y *FDEC_STRIDE], mm3
movq [r0+(Y+1)*FDEC_STRIDE], mm0
psllq mm3, 8
psllq mm0, 8
%assign Y (Y+2)
psrldq xmm5, 1
psrldq xmm2, 1
movq [r0+Y*FDEC_STRIDE], xmm5
movq [r0+(Y-1)*FDEC_STRIDE], xmm2
%assign Y (Y-2)
%endrep
movq [r0+ Y *FDEC_STRIDE], mm3
movq [r0+(Y+1)*FDEC_STRIDE], mm0
RET
;-----------------------------------------------------------------------------
; void predict_8x8_hd_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
@@ -684,7 +849,7 @@ cglobal predict_8x8_hd_mmxext, 2,2
movq [r0+2*FDEC_STRIDE], mm7
PALIGNR mm1, mm3, 4, mm5
movq [r0+1*FDEC_STRIDE], mm1
PALIGNR mm0, mm3, 6, mm5
PALIGNR mm0, mm3, 6, mm3
movq [r0+0*FDEC_STRIDE], mm0
movq mm2, mm6
movq mm3, mm6
@@ -693,23 +858,24 @@ cglobal predict_8x8_hd_mmxext, 2,2
movq [r0-2*FDEC_STRIDE], mm6
PALIGNR mm2, mm4, 4, mm5
movq [r0-3*FDEC_STRIDE], mm2
PALIGNR mm3, mm4, 6, mm5
PALIGNR mm3, mm4, 6, mm4
movq [r0-4*FDEC_STRIDE], mm3
RET
;-----------------------------------------------------------------------------
; void predict_8x8_hd_ssse3( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_hd_ssse3, 2,2
%macro PREDICT_8x8_HD 1
cglobal predict_8x8_hd_%1, 2,2
add r0, 4*FDEC_STRIDE
movdqa xmm0, [r1]
movdqa xmm1, [r1+16]
movdqa xmm2, xmm1
movdqa xmm3, xmm1
palignr xmm1, xmm0, 7
palignr xmm2, xmm0, 9
palignr xmm3, xmm0, 8
movdqa xmm4, xmm1
PALIGNR xmm1, xmm0, 7, xmm4
PALIGNR xmm2, xmm0, 9, xmm5
PALIGNR xmm3, xmm0, 8, xmm0
movdqa xmm4, xmm1
pavgb xmm4, xmm3
PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm5
punpcklbw xmm4, xmm0
@@ -726,63 +892,41 @@ cglobal predict_8x8_hd_ssse3, 2,2
movq [r0+(Y)*FDEC_STRIDE], xmm4
movq [r0+(Y-4)*FDEC_STRIDE], xmm0
RET
%endmacro
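; Without SSSE3, the PALIGNR macro falls back to an SSE2 emulation along
; these lines (a sketch of the usual technique, not the verbatim macro):
;     PALIGNR dst, src, imm, tmp:
;         movdqa  tmp, src
;         pslldq  dst, 16-imm
;         psrldq  tmp, imm
;         por     dst, tmp
; which is why the macro-ized version above threads an explicit scratch
; register through every call instead of using bare two-operand palignr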
;-----------------------------------------------------------------------------
; void predict_8x8_hu_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_hu_mmxext, 2,2
movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
add r0, 4*FDEC_STRIDE
pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
movq mm2, mm0
psllw mm0, 8
psrlw mm2, 8
por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
movq mm3, mm2
movq mm4, mm2
movq mm5, mm2
psrlq mm2, 8
psrlq mm3, 16
por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1
punpckhbw mm1, mm1
por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2
pavgb mm4, mm2
PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
movq mm5, mm4
punpcklbw mm4, mm1 ; p4 p3 p2 p1
punpckhbw mm5, mm1 ; p8 p7 p6 p5
movq mm6, mm5
movq mm7, mm5
movq mm0, mm5
PALIGNR mm5, mm4, 2, mm1
pshufw mm1, mm6, 11111001b
PALIGNR mm6, mm4, 4, mm2
pshufw mm2, mm7, 11111110b
PALIGNR mm7, mm4, 6, mm3
pshufw mm3, mm0, 11111111b
movq [r0-4*FDEC_STRIDE], mm4
movq [r0-3*FDEC_STRIDE], mm5
movq [r0-2*FDEC_STRIDE], mm6
movq [r0-1*FDEC_STRIDE], mm7
movq [r0+0*FDEC_STRIDE], mm0
movq [r0+1*FDEC_STRIDE], mm1
movq [r0+2*FDEC_STRIDE], mm2
movq [r0+3*FDEC_STRIDE], mm3
RET
INIT_XMM
PREDICT_8x8_HD sse2
%define PALIGNR PALIGNR_SSSE3
PREDICT_8x8_HD ssse3
INIT_MMX
%define PALIGNR PALIGNR_MMX
;-----------------------------------------------------------------------------
; void predict_8x8_hu_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_hu_sse2, 2,2
movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
%macro PREDICT_8x8_HU 1
cglobal predict_8x8_hu_%1, 2,2
add r0, 4*FDEC_STRIDE
%ifidn %1, ssse3
movq mm5, [r1+7]
movq mm6, [pb_reverse GLOBAL]
movq mm1, mm5
movq mm2, mm5
movq mm3, mm5
pshufb mm5, mm6
psrlq mm6, 8
pshufb mm2, mm6
psrlq mm6, 8
pshufb mm3, mm6
movq mm4, mm5
%else
movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
movq mm2, mm0
psllw mm0, 8
psrlw mm2, 8
por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
movq mm3, mm2
movq mm4, mm2
movq mm5, mm2
@@ -791,30 +935,33 @@ cglobal predict_8x8_hu_sse2, 2,2
por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1
punpckhbw mm1, mm1
por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2
%endif
pavgb mm4, mm2
PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
movq2dq xmm0, mm4
movq2dq xmm1, mm1
punpcklbw xmm0, xmm1
movhlps xmm4, xmm0
pshuflw xmm5, xmm4, 11111001b
pshuflw xmm6, xmm4, 11111110b
pshuflw xmm7, xmm4, 11111111b
punpckhbw mm4, mm1
%assign Y -4
%rep 3
movq [r0+Y*FDEC_STRIDE], xmm0
psrldq xmm0, 2
%assign Y (Y+1)
%endrep
pshufw mm5, mm4, 11111001b
pshufw mm6, mm4, 11111110b
pshufw mm7, mm4, 11111111b
movq [r0+Y*FDEC_STRIDE], xmm0
movq [r0+0*FDEC_STRIDE], xmm4
movq [r0+1*FDEC_STRIDE], xmm5
movq [r0+2*FDEC_STRIDE], xmm6
movq [r0+3*FDEC_STRIDE], xmm7
movq [r0+0*FDEC_STRIDE], mm4
movq [r0+1*FDEC_STRIDE], mm5
movq [r0+2*FDEC_STRIDE], mm6
movq [r0+3*FDEC_STRIDE], mm7
RET
%endmacro
PREDICT_8x8_HU sse2
PREDICT_8x8_HU ssse3
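
The ssse3 path above replaces the whole pshufw/psllw/psrlw/por byte-reversal dance with a single pshufb against pb_reverse. A minimal intrinsics illustration of the same trick (hypothetical helper, compile with -mssse3; not code from x264):

#include <tmmintrin.h>  /* SSSE3: pshufb */

/* Reverse the 8 bytes of an MMX register with one pshufb, as the
   ssse3 path does with the pb_reverse constant. */
static inline __m64 reverse8_bytes( __m64 v )
{
    /* index i selects source byte 7-i; _mm_set_pi8 lists bytes
       high-to-low, so this is {7,6,5,4,3,2,1,0} in memory order */
    const __m64 pb_reverse = _mm_set_pi8( 0, 1, 2, 3, 4, 5, 6, 7 );
    return _mm_shuffle_pi8( v, pb_reverse );
}

The asm goes one step further: it shifts the index constant itself (psrlq mm6, 8) so the same pshufb also yields the shifted copies with the end sample replicated, avoiding separate shift/or fixups.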
;-----------------------------------------------------------------------------
; void predict_8x8c_v_mmx( uint8_t *src )
@@ -885,90 +1032,65 @@ cglobal predict_8x8c_dc_core_mmxext, 1,1
STORE8x8 mm0, mm2
RET
%macro LOAD_PLANE_ARGS 0
%ifdef ARCH_X86_64
movd mm0, r1d
movd mm2, r2d
movd mm4, r3d
pshufw mm0, mm0, 0
pshufw mm2, mm2, 0
pshufw mm4, mm4, 0
%else
pshufw mm0, r1m, 0
pshufw mm2, r2m, 0
pshufw mm4, r3m, 0
%endif
%endmacro
;-----------------------------------------------------------------------------
; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
cglobal predict_8x8c_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
movq mm1, mm2
pmullw mm2, [pw_3210 GLOBAL]
psllw mm1, 2
paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
mov r1d, 8
ALIGN 4
.loop:
movq mm5, mm0
movq mm6, mm1
psraw mm5, 5
psraw mm6, 5
packuswb mm5, mm6
movq [r0], mm5
paddsw mm0, mm4
paddsw mm1, mm4
add r0, FDEC_STRIDE
dec r1d
jg .loop
REP_RET
cglobal predict_8x8c_dc_top_mmxext, 1,1
movq mm0, [r0 - FDEC_STRIDE]
pxor mm1, mm1
pxor mm2, mm2
punpckhbw mm1, mm0
punpcklbw mm0, mm2
psadbw mm1, mm2 ; s1
psadbw mm0, mm2 ; s0
psrlw mm1, 1
psrlw mm0, 1
pavgw mm1, mm2
pavgw mm0, mm2
pshufw mm1, mm1, 0
pshufw mm0, mm0, 0 ; dc0 (w)
packuswb mm0, mm1 ; dc0,dc1 (b)
STORE8x8 mm0, mm0
RET
;-----------------------------------------------------------------------------
; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
; void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
cglobal predict_16x16_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
movq mm5, mm2
movq mm1, mm2
pmullw mm5, [pw_3210 GLOBAL]
psllw mm2, 3
psllw mm1, 2
movq mm3, mm2
paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
mov r1d, 16
ALIGN 4
cglobal predict_8x8c_p_core_sse2, 1,1
movd xmm0, r1m
movd xmm2, r2m
movd xmm4, r3m
pshuflw xmm0, xmm0, 0
pshuflw xmm2, xmm2, 0
pshuflw xmm4, xmm4, 0
punpcklqdq xmm0, xmm0
punpcklqdq xmm2, xmm2
punpcklqdq xmm4, xmm4
pmullw xmm2, [pw_76543210 GLOBAL]
paddsw xmm0, xmm2 ; xmm0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
movdqa xmm3, xmm0
paddsw xmm3, xmm4
paddsw xmm4, xmm4
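; xmm4 is doubled to 2*c because .loop emits its rows in pairs; the call
; below runs .loop once for rows 0-3 and returns here, r0 then advances
; four rows, and execution falls through .loop again for rows 4-7, whose
; final RET returns to the real caller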
call .loop
add r0, FDEC_STRIDE*4
.loop:
movq mm5, mm0
movq mm6, mm1
psraw mm5, 5
psraw mm6, 5
packuswb mm5, mm6
movq [r0], mm5
movq mm5, mm2
movq mm6, mm3
psraw mm5, 5
psraw mm6, 5
packuswb mm5, mm6
movq [r0+8], mm5
paddsw mm0, mm4
paddsw mm1, mm4
paddsw mm2, mm4
paddsw mm3, mm4
add r0, FDEC_STRIDE
dec r1d
jg .loop
REP_RET
movdqa xmm5, xmm0
movdqa xmm1, xmm3
psraw xmm0, 5
psraw xmm3, 5
packuswb xmm0, xmm3
movq [r0+FDEC_STRIDE*0], xmm0
movhps [r0+FDEC_STRIDE*1], xmm0
paddsw xmm5, xmm4
paddsw xmm1, xmm4
movdqa xmm0, xmm5
movdqa xmm3, xmm1
psraw xmm5, 5
psraw xmm1, 5
packuswb xmm5, xmm1
movq [r0+FDEC_STRIDE*2], xmm5
movhps [r0+FDEC_STRIDE*3], xmm5
paddsw xmm0, xmm4
paddsw xmm3, xmm4
RET
;-----------------------------------------------------------------------------
; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
@@ -988,20 +1110,28 @@ cglobal predict_16x16_p_core_sse2, 1,2
psllw xmm1, 3
paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
mov r1d, 16
movdqa xmm7, xmm2
paddsw xmm7, xmm7
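; xmm7 = 2*c: the loop below now writes two rows per iteration (xmm3/xmm4
; for one row, xmm5/xmm6 for the next), so the counter drops from 16 to 8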
mov r1d, 8
ALIGN 4
.loop:
movdqa xmm3, xmm0
movdqa xmm4, xmm1
movdqa xmm5, xmm0
movdqa xmm6, xmm1
psraw xmm3, 5
psraw xmm4, 5
paddsw xmm5, xmm2
paddsw xmm6, xmm2
psraw xmm5, 5
psraw xmm6, 5
packuswb xmm3, xmm4
movdqa [r0], xmm3
paddsw xmm0, xmm2
paddsw xmm1, xmm2
add r0, FDEC_STRIDE
packuswb xmm5, xmm6
movdqa [r0+FDEC_STRIDE*0], xmm3
movdqa [r0+FDEC_STRIDE*1], xmm5
paddsw xmm0, xmm7
paddsw xmm1, xmm7
add r0, FDEC_STRIDE*2
dec r1d
jg .loop
REP_RET
@@ -1018,10 +1148,10 @@ cglobal predict_16x16_v_mmx, 1,2
;-----------------------------------------------------------------------------
; void predict_16x16_v_sse2( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_16x16_v_sse2, 1,2
cglobal predict_16x16_v_sse2, 1,1
movdqa xmm0, [r0 - FDEC_STRIDE]
STORE16x16_SSE2 xmm0
REP_RET
RET
;-----------------------------------------------------------------------------
; void predict_16x16_h_mmxext( uint8_t *src )
@@ -1086,6 +1216,13 @@ cglobal predict_16x16_dc_top_mmxext, 1,2
PRED16x16_DC [pw_8 GLOBAL], 4
REP_RET
cglobal predict_16x16_dc_left_core_mmxext, 1,1
movd mm0, r1m
pshufw mm0, mm0, 0
packuswb mm0, mm0
STORE16x16 mm0, mm0
REP_RET
;-----------------------------------------------------------------------------
; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left )
;-----------------------------------------------------------------------------
@@ -1103,12 +1240,19 @@ cglobal predict_16x16_dc_top_mmxext, 1,2
STORE16x16_SSE2 xmm0
%endmacro
cglobal predict_16x16_dc_core_sse2, 1,2
cglobal predict_16x16_dc_core_sse2, 1,1
movd xmm2, r1m
PRED16x16_DC_SSE2 xmm2, 5
REP_RET
RET
cglobal predict_16x16_dc_top_sse2, 1,2
cglobal predict_16x16_dc_top_sse2, 1,1
PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4
REP_RET
RET
cglobal predict_16x16_dc_left_core_sse2, 1,1
movd xmm0, r1m
pshuflw xmm0, xmm0, 0
punpcklqdq xmm0, xmm0
packuswb xmm0, xmm0
STORE16x16_SSE2 xmm0
REP_RET
@@ -29,10 +29,13 @@ extern void predict_16x16_v_mmx( uint8_t *src );
extern void predict_16x16_h_mmxext( uint8_t *src );
extern void predict_16x16_h_ssse3( uint8_t *src );
extern void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left );
extern void predict_16x16_dc_left_core_mmxext( uint8_t *src, int i_dc_left );
extern void predict_16x16_dc_top_mmxext( uint8_t *src );
extern void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
extern void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
extern void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c );
extern void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
extern void predict_8x8c_dc_top_mmxext( uint8_t *src );
extern void predict_8x8c_v_mmx( uint8_t *src );
extern void predict_8x8c_h_mmxext( uint8_t *src );
extern void predict_8x8c_h_ssse3( uint8_t *src );
@@ -48,9 +51,12 @@ extern void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_ddl_sse2( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_ddr_sse2( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_vr_sse2( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_hu_sse2( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_hd_sse2( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_hd_ssse3( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_hu_ssse3( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_filter_mmxext ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
extern void predict_8x8_filter_ssse3 ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
extern void predict_4x4_ddl_mmxext( uint8_t *src );
@@ -65,9 +71,14 @@ extern void predict_4x4_ddr_ssse3( uint8_t *src );
extern void predict_4x4_hu_mmxext( uint8_t *src );
extern void predict_16x16_dc_top_sse2( uint8_t *src );
extern void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left );
extern void predict_16x16_dc_left_core_sse2( uint8_t *src, int i_dc_left );
extern void predict_16x16_v_sse2( uint8_t *src );
extern void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
DECLARE_ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};