Commit 1f572510 authored by Fiona Glaser

Some cosmetics/cleanup

Move some macros to x86util.asm that should have been there to begin with.
Fix a typo that didn't cause any issues.
parent 57505e30
......@@ -391,64 +391,8 @@ cglobal x264_pixel_var_8x8_sse2, 2,4,8
; SATD
;=============================================================================
%macro TRANS_SSE2 5-6
; TRANSPOSE2x2
; Exchanges half-elements between two registers using only shifts and
; bitwise ops. In the per-line notes below, "x1x0" denotes an element whose
; high half is x1 and low half is x0. With m%3 = x1x0 and m%4 = x5x4 on
; entry, the two-tmp path leaves x5x1 in m%3 and x4x0 in m%4.
; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq
; %2: ord/unord (for compat with sse4, unused)
; %3/%4: source regs
; %5/%6: tmp regs
; Pick the constant mask and the shift count (half of the %1 element width).
; NOTE(review): these are plain %defines, so they remain defined after the
; macro has expanded; if %1 is neither d nor q they are not set here at all.
; mask_10 / mask_1100 are constants declared elsewhere - presumably they have
; the high half of every element set (see the "ff00" note) - TODO confirm.
%ifidn %1, d
%define mask [mask_10 GLOBAL]
%define shift 16
%elifidn %1, q
%define mask [mask_1100 GLOBAL]
%define shift 32
%endif
%if %0==6 ; less dependency if we have two tmp
mova m%5, mask ; ff00
mova m%6, m%4 ; x5x4
psll%1 m%4, shift ; x4..
pand m%6, m%5 ; x5..
pandn m%5, m%3 ; ..x0
psrl%1 m%3, shift ; ..x1
por m%4, m%5 ; x4x0
por m%3, m%6 ; x5x1
%else ; more dependency, one insn less. sometimes faster, sometimes not
; Single-tmp path: the masked XOR difference (x4^x1) is used to swap the
; halves, so every instruction depends on the previous one.
mova m%5, m%4 ; x5x4
psll%1 m%4, shift ; x4..
pxor m%4, m%3 ; (x4^x1)x0
pand m%4, mask ; (x4^x1)..
pxor m%3, m%4 ; x4x0
psrl%1 m%4, shift ; ..(x1^x4)
pxor m%5, m%4 ; x5x1
; At this point the results sit in m%3 (x4x0) and m%5 (x5x1).
; SWAP is defined outside this file section; presumably it renames the
; registers so the outputs match the two-tmp path - TODO confirm.
SWAP %4, %3, %5
%endif
%endmacro
; TRANS names the SSE2 implementation from here on.
%define TRANS TRANS_SSE2
%macro TRANS_SSE4 5-6 ; see above
; Variant of the 2x2 transpose built on pblendw (d) and shufps (q).
; Takes the same arguments as TRANS_SSE2; only m%5 is used as a temporary,
; %6 is accepted but never referenced.
; Notation: "x1x0" = element with high half x1 and low half x0;
; on entry m%3 = x1x0 and m%4 = x5x4.
%ifidn %1, d
; d: operate on the 16-bit halves of each dword.
; ord -> m%3 = x5x1, m%4 = x4x0
; unord -> m%3 = x5x0, m%4 = x4x1 (saves the initial shift of m%3)
mova m%5, m%3
%ifidn %2, ord
psrl%1 m%3, 16
%endif
; take the odd-numbered words (high halves) from m%4, keep the even ones
pblendw m%3, m%4, 10101010b
psll%1 m%4, 16
%ifidn %2, ord
; take the even-numbered words (low halves) from the saved copy of m%3
pblendw m%4, m%5, 01010101b
%else
psrl%1 m%5, 16
por m%4, m%5
%endif
%elifidn %1, q
; q: %2 is not consulted in this branch.
mova m%5, m%3
; gather dwords 0 and 2 of the original m%3, followed by dwords 0 and 2 of m%4
shufps m%3, m%4, 10001000b
; gather dwords 1 and 3 of the original m%3, followed by dwords 1 and 3 of m%4
shufps m%5, m%4, 11011101b
; SWAP is defined outside this file section; presumably it renames the
; registers so the second result is reachable as m%4 - TODO confirm.
SWAP %4, %5
%endif
%endmacro
%macro JDUP_SSE2 2
punpckldq %1, %2
; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
......
......@@ -222,6 +222,62 @@
SUMSUB_BADC %3, %7, %4, %8
%endmacro
%macro TRANS_SSE2 5-6
; TRANSPOSE2x2
; Exchanges half-elements between two registers using only shifts and
; bitwise ops. In the per-line notes below, "x1x0" denotes an element whose
; high half is x1 and low half is x0. With m%3 = x1x0 and m%4 = x5x4 on
; entry, the two-tmp path leaves x5x1 in m%3 and x4x0 in m%4.
; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq
; %2: ord/unord (for compat with sse4, unused)
; %3/%4: source regs
; %5/%6: tmp regs
; Pick the constant mask and the shift count (half of the %1 element width).
; NOTE(review): these are plain %defines, so they remain defined after the
; macro has expanded; if %1 is neither d nor q they are not set here at all.
; mask_10 / mask_1100 are constants declared elsewhere - presumably they have
; the high half of every element set (see the "ff00" note) - TODO confirm.
%ifidn %1, d
%define mask [mask_10 GLOBAL]
%define shift 16
%elifidn %1, q
%define mask [mask_1100 GLOBAL]
%define shift 32
%endif
%if %0==6 ; less dependency if we have two tmp
mova m%5, mask ; ff00
mova m%6, m%4 ; x5x4
psll%1 m%4, shift ; x4..
pand m%6, m%5 ; x5..
pandn m%5, m%3 ; ..x0
psrl%1 m%3, shift ; ..x1
por m%4, m%5 ; x4x0
por m%3, m%6 ; x5x1
%else ; more dependency, one insn less. sometimes faster, sometimes not
; Single-tmp path: the masked XOR difference (x4^x1) is used to swap the
; halves, so every instruction depends on the previous one.
mova m%5, m%4 ; x5x4
psll%1 m%4, shift ; x4..
pxor m%4, m%3 ; (x4^x1)x0
pand m%4, mask ; (x4^x1)..
pxor m%3, m%4 ; x4x0
psrl%1 m%4, shift ; ..(x1^x4)
pxor m%5, m%4 ; x5x1
; At this point the results sit in m%3 (x4x0) and m%5 (x5x1).
; SWAP is defined outside this file section; presumably it renames the
; registers so the outputs match the two-tmp path - TODO confirm.
SWAP %4, %3, %5
%endif
%endmacro
%macro TRANS_SSE4 5-6 ; see above
; Variant of the 2x2 transpose built on pblendw (d) and shufps (q).
; Takes the same arguments as TRANS_SSE2; only m%5 is used as a temporary,
; %6 is accepted but never referenced.
; Notation: "x1x0" = element with high half x1 and low half x0;
; on entry m%3 = x1x0 and m%4 = x5x4.
%ifidn %1, d
; d: operate on the 16-bit halves of each dword.
; ord -> m%3 = x5x1, m%4 = x4x0
; unord -> m%3 = x5x0, m%4 = x4x1 (saves the initial shift of m%3)
mova m%5, m%3
%ifidn %2, ord
psrl%1 m%3, 16
%endif
; take the odd-numbered words (high halves) from m%4, keep the even ones
pblendw m%3, m%4, 10101010b
psll%1 m%4, 16
%ifidn %2, ord
; take the even-numbered words (low halves) from the saved copy of m%3
pblendw m%4, m%5, 01010101b
%else
psrl%1 m%5, 16
por m%4, m%5
%endif
%elifidn %1, q
; q: %2 is not consulted in this branch.
mova m%5, m%3
; gather dwords 0 and 2 of the original m%3, followed by dwords 0 and 2 of m%4
shufps m%3, m%4, 10001000b
; gather dwords 1 and 3 of the original m%3, followed by dwords 1 and 3 of m%4
shufps m%5, m%4, 11011101b
; SWAP is defined outside this file section; presumably it renames the
; registers so the second result is reachable as m%4 - TODO confirm.
SWAP %4, %5
%endif
%endmacro
%macro HADAMARD 5-6
; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes)
; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes)
......
......@@ -728,7 +728,7 @@ x264_t *x264_encoder_open ( x264_param_t *param )
x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c );
x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter );
x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
if( !h->param.b_cabac );
if( !h->param.b_cabac )
x264_init_vlc_tables();
x264_pixel_init( h->param.cpu, &h->pixf );
x264_dct_init( h->param.cpu, &h->dctf );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment