Commit 6d7c5efc authored by Fiona Glaser

Clean up and optimize weightp, plus enable SSSE3 weight on SB/BDZ

Also remove unused AVX cruft.
parent 047175e6
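
For context, a minimal C sketch of the per-pixel weighting that the WEIGHT/WEIGHT_ROWx2 macros below vectorize; the names (clip_pixel, weight_plane) and argument list are illustrative here, not the exact x264 C fallback:

#include <stdint.h>

/* dst[x] = clip( ((src[x]*scale + (1<<(denom-1))) >> denom) + offset )
 * The SSE2 path does this literally with pmullw/paddsw/psraw, folding
 * offset<<denom into the rounding constant (see the paddsw comments). */
static inline int clip_pixel( int x )
{
    return x < 0 ? 0 : x > 255 ? 255 : x; /* 8-bit depth assumed */
}

static void weight_plane( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
                          int scale, int denom, int offset, int width, int height )
{
    /* denom > 0 assumed; x264 handles the no-denom case separately */
    for( int y = 0; y < height; y++, dst += i_dst_stride, src += i_src_stride )
        for( int x = 0; x < width; x++ )
            dst[x] = clip_pixel( ((src[x] * scale + (1 << (denom-1))) >> denom) + offset );
}

The SSSE3 path replaces the multiply/add/shift sequence with psllw 7 followed by pmulhrsw against a scale value that weight_cache_ssse3 has presumably pre-shifted to absorb denom, so the scale, rounding, and shift collapse into one rounding high multiply before the offset is added. That is the variant this commit re-enables on Sandy Bridge and Bulldozer.
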
@@ -252,7 +252,8 @@ AVG_WEIGHT 16, 7
;=============================================================================
%if HIGH_BIT_DEPTH
%macro WEIGHT_START 1 ; (width)
; width
%macro WEIGHT_START 1
mova m0, [r4+ 0] ; 1<<denom
mova m3, [r4+16]
movd m2, [r4+32] ; denom
@@ -260,7 +261,8 @@ AVG_WEIGHT 16, 7
paddw m2, [sq_1] ; denom+1
%endmacro
%macro WEIGHT 2 ; (src1, src2)
; src1, src2
%macro WEIGHT 2
movh m5, [%1]
movh m6, [%2]
punpcklwd m5, m0
@@ -272,7 +274,8 @@ AVG_WEIGHT 16, 7
packssdw m5, m6
%endmacro
%macro WEIGHT_TWO_ROW 3 ; (src, dst, width)
; src, dst, width
%macro WEIGHT_TWO_ROW 3
%assign x 0
%rep (%3+mmsize/2-1)/(mmsize/2)
%if %3-x/2 <= 4 && mmsize == 16
@@ -297,109 +300,98 @@ AVG_WEIGHT 16, 7
%macro WEIGHT_START 1
mova m3, [r4]
mova m6, [r4+16]
mova m4, [r4+16]
%if notcpuflag(ssse3) || cpuflag(xop)
movd m5, [r4+32]
pxor m2, m2
%if (%1 == 20 || %1 == 12) && mmsize == 16
movdq2q mm3, xmm3
movdq2q mm4, xmm4
movdq2q mm5, xmm5
movdq2q mm6, xmm6
pxor mm2, mm2
%endif
%endmacro
%macro WEIGHT_START_SSSE3 1
mova m3, [r4]
mova m4, [r4+16]
pxor m2, m2
%if %1 == 20 || %1 == 12
movdq2q mm3, xmm3
movdq2q mm4, xmm4
pxor mm2, mm2
%endif
%endmacro
;; macro to weight mmsize bytes taking half from %1 and half from %2
%macro WEIGHT 2 ; (src1,src2)
movh m0, [%1]
movh m1, [%2]
punpcklbw m0, m2 ;setup
punpcklbw m1, m2 ;setup
pmullw m0, m3 ;scale
pmullw m1, m3 ;scale
paddsw m0, m6 ;1<<(denom-1)+(offset<<denom)
paddsw m1, m6 ;1<<(denom-1)+(offset<<denom)
psraw m0, m5 ;denom
psraw m1, m5 ;denom
; src1, src2, dst1, dst2
%macro WEIGHT_ROWx2 4
movh m0, [%1 ]
movh m1, [%1+mmsize/2]
movh m6, [%2 ]
movh m7, [%2+mmsize/2]
punpcklbw m0, m2
punpcklbw m1, m2
punpcklbw m6, m2
punpcklbw m7, m2
%if cpuflag(ssse3)
psllw m0, 7
psllw m1, 7
psllw m6, 7
psllw m7, 7
pmulhrsw m0, m3
pmulhrsw m1, m3
pmulhrsw m6, m3
pmulhrsw m7, m3
paddw m0, m4
paddw m1, m4
paddw m6, m4
paddw m7, m4
%else
pmullw m0, m3
pmullw m1, m3
pmullw m6, m3
pmullw m7, m3
paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
paddsw m1, m4
paddsw m6, m4
paddsw m7, m4
psraw m0, m5
psraw m1, m5
psraw m6, m5
psraw m7, m5
%endif
packuswb m0, m1
packuswb m6, m7
mova [%3], m0
mova [%4], m6
%endmacro
%macro WEIGHT_SSSE3 2
; src1, src2, dst1, dst2, width
%macro WEIGHT_COL 5
movh m0, [%1]
movh m1, [%2]
punpcklbw m0, m2
punpcklbw m1, m2
%if cpuflag(ssse3)
psllw m0, 7
psllw m1, 7
pmulhrsw m0, m3
pmulhrsw m1, m3
paddw m0, m4
paddw m1, m4
%endmacro
%macro WEIGHT_SAVE_ROW 3 ;(src,dst,width)
%if %3 == 16
mova [%2], %1
%elif %3 == 8
movq [%2], %1
%else
movd [%2], %1 ; width 2 can write garbage for last 2 bytes
pmullw m0, m3
pmullw m1, m3
paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
paddsw m1, m4
psraw m0, m5
psraw m1, m5
%endif
%endmacro
%macro WEIGHT_ROW 3 ; (src,dst,width)
;; load weights
WEIGHT %1, (%1+(mmsize/2))
packuswb m0, m1 ;put bytes into m0
WEIGHT_SAVE_ROW m0, %2, %3
%endmacro
%macro WEIGHT_SAVE_COL 2 ;(dst,size)
%if %2 == 8
%if %5 == 8
packuswb m0, m1
movq [%1], m0
movhps [%1+r1], m0
movh [%3], m0
movhps [%4], m0
%else
packuswb m0, m0
packuswb m1, m1
movd [%1], m0 ; width 2 can write garbage for last 2 bytes
movd [%1+r1], m1
movd [%3], m0 ; width 2 can write garbage for the last 2 bytes
movd [%4], m1
%endif
%endmacro
%macro WEIGHT_COL 3 ; (src,dst,width)
%if %3 <= 4 && mmsize == 16
INIT_MMX
;; load weights
WEIGHT %1, (%1+r3)
WEIGHT_SAVE_COL %2, %3
INIT_XMM
%else
WEIGHT %1, (%1+r3)
WEIGHT_SAVE_COL %2, %3
%endif
%endmacro
%macro WEIGHT_TWO_ROW 3 ; (src,dst,width)
; src, dst, width
%macro WEIGHT_TWO_ROW 3
%assign x 0
%rep %3
%if (%3-x) >= mmsize
WEIGHT_ROW (%1+x), (%2+x), mmsize ; weight 1 mmsize
WEIGHT_ROW (%1+r3+x), (%2+r1+x), mmsize ; weight 1 mmsize
WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x
%assign x (x+mmsize)
%else
WEIGHT_COL (%1+x),(%2+x),(%3-x)
WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, %3-x
%exitrep
%endif
%if x >= %3
@@ -414,34 +406,15 @@ AVG_WEIGHT 16, 7
;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h )
;-----------------------------------------------------------------------------
%if ARCH_X86_64
%define NUMREGS 6
%define LOAD_HEIGHT
%define HEIGHT_REG r5d
%define TMP_REG r6d
%else
%define NUMREGS 5
%define TMP_REG r5d
%define LOAD_HEIGHT mov r4d, r5m
%define HEIGHT_REG r4d
%endif
%assign XMMREGS 7
%if HIGH_BIT_DEPTH
%assign NUMREGS NUMREGS+1
%assign XMMREGS 8
%endif
%macro WEIGHTER 1
cglobal mc_weight_w%1, NUMREGS, NUMREGS, XMMREGS
cglobal mc_weight_w%1, 6,6,8
FIX_STRIDES r1, r3
WEIGHT_START %1
LOAD_HEIGHT
.loop:
WEIGHT_TWO_ROW r2, r0, %1
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
sub HEIGHT_REG, 2
sub r5d, 2
jg .loop
REP_RET
%endmacro
@@ -458,24 +431,13 @@ WEIGHTER 16
WEIGHTER 20
%if HIGH_BIT_DEPTH
WEIGHTER 12
INIT_XMM avx
WEIGHTER 8
WEIGHTER 12
WEIGHTER 16
WEIGHTER 20
%else
%define WEIGHT WEIGHT_SSSE3
%define WEIGHT_START WEIGHT_START_SSSE3
INIT_MMX ssse3
WEIGHTER 4
INIT_XMM ssse3
WEIGHTER 8
WEIGHTER 16
WEIGHTER 20
INIT_XMM avx
WEIGHTER 8
WEIGHTER 16
WEIGHTER 20
%endif
%macro OFFSET_OP 7
@@ -520,7 +482,7 @@ WEIGHTER 20
;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h )
;-----------------------------------------------------------------------------
%macro OFFSET 2
cglobal mc_offset%2_w%1, NUMREGS, NUMREGS
cglobal mc_offset%2_w%1, 6,6
FIX_STRIDES r1, r3
mova m2, [r4]
%if HIGH_BIT_DEPTH
@@ -528,12 +490,11 @@ WEIGHTER 20
mova m3, [pw_pixel_max]
%endif
%endif
LOAD_HEIGHT
.loop:
OFFSET_TWO_ROW r2, r0, %1, %2
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
sub HEIGHT_REG, 2
sub r5d, 2
jg .loop
REP_RET
%endmacro
@@ -552,20 +513,10 @@ INIT_XMM sse2
OFFSETPN 12
OFFSETPN 16
OFFSETPN 20
INIT_XMM avx
OFFSETPN 12
OFFSETPN 16
OFFSETPN 20
%if HIGH_BIT_DEPTH
INIT_XMM sse2
OFFSETPN 8
INIT_XMM avx
OFFSETPN 8
%endif
%undef LOAD_HEIGHT
%undef HEIGHT_REG
%undef NUMREGS
;=============================================================================
......
@@ -72,11 +72,6 @@ MC_WEIGHT( 8, ssse3 )
MC_WEIGHT( 12, ssse3 )
MC_WEIGHT( 16, ssse3 )
MC_WEIGHT( 20, ssse3 )
MC_WEIGHT( 4, avx )
MC_WEIGHT( 8, avx )
MC_WEIGHT( 12, avx )
MC_WEIGHT( 16, avx )
MC_WEIGHT( 20, avx )
#undef MC_OFFSET
#undef MC_WEIGHT
@@ -745,6 +740,10 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx;
pf->integral_init8h = x264_integral_init8h_avx;
pf->hpel_filter = x264_hpel_filter_avx;
/* ssse3 weight seems to be faster again on Sandy Bridge and Bulldozer. */
pf->weight_cache = x264_weight_cache_ssse3;
pf->weight = x264_mc_weight_wtab_ssse3;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_avx;
......