Commit abde94f6 authored by Oskar Arvidsson, committed by Fiona Glaser

x86 asm for high-bit-depth pixel metrics

Overall speed change from these 6 asm patches: ~4.4x.
But there's still tons more asm to do -- patches welcome!

Breakdown from this patch:
~13x faster SAD than C.
~11.5x faster SATD than C (only MMX done).
~18.5x faster SA8D than C.
~19.2x faster hadamard_ac than C.
~8.3x faster SSD than C.
~12.4x faster VAR than C.
~3-4.2x faster intra SAD than C.
~7.9x faster intra SATD than C.
parent 3afd514e
......@@ -59,8 +59,13 @@ endif
# MMX/SSE optims
ifneq ($(AS),)
X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm \
cpu-a.asm dct-32.asm bitstream-a.asm
ifneq ($(findstring X264_HIGH_BIT_DEPTH, $(CONFIG)),)
X86SRC0 += sad16-a.asm
else
X86SRC0 += sad-a.asm
endif
X86SRC = $(X86SRC0:%=common/x86/%)
ifeq ($(ARCH),X86)
......
......@@ -482,51 +482,72 @@ SATD_X_DECL6( cpu )\
SATD_X( 4x4, cpu )
SATD_X_DECL7()
#if !X264_HIGH_BIT_DEPTH
#if HAVE_MMX
SATD_X_DECL7( _mmxext )
#if !X264_HIGH_BIT_DEPTH
SATD_X_DECL6( _sse2 )
SATD_X_DECL7( _ssse3 )
SATD_X_DECL7( _sse4 )
#endif // !X264_HIGH_BIT_DEPTH
#endif
#if !X264_HIGH_BIT_DEPTH
#if HAVE_ARMV6
SATD_X_DECL7( _neon )
#endif
#endif // !X264_HIGH_BIT_DEPTH
#define INTRA_MBCMP_8x8( mbcmp )\
void x264_intra_##mbcmp##_x3_8x8( pixel *fenc, pixel edge[33], int res[3] )\
#define INTRA_MBCMP_8x8( mbcmp, cpu )\
void x264_intra_##mbcmp##_x3_8x8##cpu( pixel *fenc, pixel edge[33], int res[3] )\
{\
pixel pix[8*FDEC_STRIDE];\
x264_predict_8x8_v_c( pix, edge );\
res[0] = x264_pixel_##mbcmp##_8x8( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
res[0] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
x264_predict_8x8_h_c( pix, edge );\
res[1] = x264_pixel_##mbcmp##_8x8( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
res[1] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
x264_predict_8x8_dc_c( pix, edge );\
res[2] = x264_pixel_##mbcmp##_8x8( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
res[2] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
}
INTRA_MBCMP_8x8(sad)
INTRA_MBCMP_8x8(sa8d)
INTRA_MBCMP_8x8( sad, )
INTRA_MBCMP_8x8(sa8d, )
#if X264_HIGH_BIT_DEPTH && HAVE_MMX
INTRA_MBCMP_8x8( sad, _mmxext)
INTRA_MBCMP_8x8( sad, _sse2 )
INTRA_MBCMP_8x8( sad, _ssse3 )
INTRA_MBCMP_8x8(sa8d, _sse2 )
#endif
#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma )\
void x264_intra_##mbcmp##_x3_##size##x##size##chroma( pixel *fenc, pixel *fdec, int res[3] )\
#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu )\
void x264_intra_##mbcmp##_x3_##size##x##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\
{\
x264_predict_##size##x##size##chroma##_##pred1##_c( fdec );\
res[0] = x264_pixel_##mbcmp##_##size##x##size( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
res[0] = x264_pixel_##mbcmp##_##size##x##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
x264_predict_##size##x##size##chroma##_##pred2##_c( fdec );\
res[1] = x264_pixel_##mbcmp##_##size##x##size( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
res[1] = x264_pixel_##mbcmp##_##size##x##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
x264_predict_##size##x##size##chroma##_##pred3##_c( fdec );\
res[2] = x264_pixel_##mbcmp##_##size##x##size( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
res[2] = x264_pixel_##mbcmp##_##size##x##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
}
INTRA_MBCMP(sad, 4, v, h, dc, )
INTRA_MBCMP(satd, 4, v, h, dc, )
INTRA_MBCMP(sad, 8, dc, h, v, c )
INTRA_MBCMP(satd, 8, dc, h, v, c )
INTRA_MBCMP(sad, 16, v, h, dc, )
INTRA_MBCMP(satd, 16, v, h, dc, )
INTRA_MBCMP( sad, 4, v, h, dc, , )
INTRA_MBCMP(satd, 4, v, h, dc, , )
INTRA_MBCMP( sad, 8, dc, h, v, c, )
INTRA_MBCMP(satd, 8, dc, h, v, c, )
INTRA_MBCMP( sad, 16, v, h, dc, , )
INTRA_MBCMP(satd, 16, v, h, dc, , )
#if X264_HIGH_BIT_DEPTH && HAVE_MMX
INTRA_MBCMP( sad, 4, v, h, dc, , _mmxext)
INTRA_MBCMP(satd, 4, v, h, dc, , _mmxext)
INTRA_MBCMP( sad, 8, dc, h, v, c, _mmxext)
INTRA_MBCMP(satd, 8, dc, h, v, c, _mmxext)
INTRA_MBCMP( sad, 16, v, h, dc, , _mmxext)
INTRA_MBCMP(satd, 16, v, h, dc, , _mmxext)
INTRA_MBCMP( sad, 8, dc, h, v, c, _sse2 )
INTRA_MBCMP( sad, 16, v, h, dc, , _sse2 )
INTRA_MBCMP( sad, 8, dc, h, v, c, _ssse3 )
INTRA_MBCMP( sad, 16, v, h, dc, , _ssse3 )
#endif
/****************************************************************************
* structural similarity metric
......@@ -719,7 +740,94 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16;
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16;
#if !X264_HIGH_BIT_DEPTH
#if X264_HIGH_BIT_DEPTH
#if HAVE_MMX
if( cpu&X264_CPU_MMXEXT )
{
INIT7( sad, _mmxext );
INIT7( sad_x3, _mmxext );
INIT7( sad_x4, _mmxext );
INIT7( satd, _mmxext );
INIT7( satd_x3, _mmxext );
INIT7( satd_x4, _mmxext );
INIT4( hadamard_ac, _mmxext );
INIT7( ssd, _mmxext );
INIT_ADS( _mmxext );
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmxext;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmxext;
pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmxext;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmxext;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmxext;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext;
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
}
if( cpu&X264_CPU_SSE2 )
{
INIT4_NAME( sad_aligned, sad, _sse2_aligned );
INIT5( ssd, _sse2 );
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2;
#if ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
pixf->var2_8x8 = x264_pixel_var2_8x8_sse2;
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
{
INIT5( sad, _sse2 );
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
INIT_ADS( _sse2 );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _sse2 );
}
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_sse2;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
}
if( cpu&X264_CPU_SSE2_IS_FAST )
{
pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_sse2;
pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_sse2;
pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_sse2;
pixf->sad_x3[PIXEL_8x4] = x264_pixel_sad_x3_8x4_sse2;
pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_sse2;
pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_sse2;
pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_sse2;
}
if( cpu&X264_CPU_SSSE3 )
{
INIT7( sad, _ssse3 );
INIT7( sad_x3, _ssse3 );
INIT7( sad_x4, _ssse3 );
INIT_ADS( _ssse3 );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _ssse3 );
}
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
}
#endif // HAVE_MMX
#else // !X264_HIGH_BIT_DEPTH
#if HAVE_MMX
if( cpu&X264_CPU_MMX )
{
......@@ -947,7 +1055,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
}
}
#endif
#endif // !X264_HIGH_BIT_DEPTH
#endif // X264_HIGH_BIT_DEPTH
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
{
......
......@@ -8,6 +8,7 @@
;* Laurent Aimar <fenrir@via.ecp.fr>
;* Alex Izvorski <aizvorksi@gmail.com>
;* Fiona Glaser <fiona@x264.com>
;* Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
......@@ -46,7 +47,7 @@ mask_1100: times 2 dd 0, -1
deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
pd_f0: times 4 dd 0xffff0000
pq_0f: times 2 dd 0xffffffff, 0
sq_0f: times 1 dq 0xffffffff
SECTION .text
......@@ -55,36 +56,95 @@ cextern pw_00ff
cextern hsub_mul
%macro HADDD 2 ; sum junk
%if mmsize == 16
movhlps %2, %1
paddd %1, %2
pshuflw %2, %1, 0xE
paddd %1, %2
%else
pshufw %2, %1, 0xE
paddd %1, %2
%endif
%endmacro
;=============================================================================
; SSD
;=============================================================================
%macro HADDW 2
pmaddwd %1, [pw_1]
HADDD %1, %2
%ifdef X264_HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int pixel_ssd_MxN( uint16_t *, int, uint16_t *, int )
;-----------------------------------------------------------------------------
%macro SSD_ONE 3
cglobal pixel_ssd_%1x%2_%3, 4,5,6*(mmsize/16)
mov r4, %1*%2/mmsize
pxor m0, m0
.loop
mova m1, [r0]
%if %1 <= mmsize/2
mova m3, [r0+r1*2]
%define offset r3*2
%define num_rows 2
%else
mova m3, [r0+mmsize]
%define offset mmsize
%define num_rows 1
%endif
psubw m1, [r2]
psubw m3, [r2+offset]
pmaddwd m1, m1
pmaddwd m3, m3
dec r4
lea r0, [r0+r1*2*num_rows]
lea r2, [r2+r3*2*num_rows]
paddd m0, m1
paddd m0, m3
jg .loop
HADDD m0, m5
movd eax, m0
RET
%endmacro
%macro HADDUW 2
mova %2, %1
pslld %1, 16
psrld %2, 16
psrld %1, 16
paddd %1, %2
HADDD %1, %2
%macro SSD_16_MMX 2
cglobal pixel_ssd_%1x%2_mmxext, 4,5
mov r4, %1*%2/mmsize/2
pxor m0, m0
.loop
mova m1, [r0]
mova m2, [r2]
mova m3, [r0+mmsize]
mova m4, [r2+mmsize]
mova m5, [r0+mmsize*2]
mova m6, [r2+mmsize*2]
mova m7, [r0+mmsize*3]
psubw m1, m2
psubw m3, m4
mova m2, [r2+mmsize*3]
psubw m5, m6
pmaddwd m1, m1
psubw m7, m2
pmaddwd m3, m3
pmaddwd m5, m5
dec r4
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
pmaddwd m7, m7
paddd m1, m3
paddd m5, m7
paddd m0, m1
paddd m0, m5
jg .loop
HADDD m0, m7
movd eax, m0
RET
%endmacro
;=============================================================================
; SSD
;=============================================================================
INIT_MMX
SSD_ONE 4, 4, mmxext
SSD_ONE 4, 8, mmxext
SSD_ONE 8, 4, mmxext
SSD_ONE 8, 8, mmxext
SSD_ONE 8, 16, mmxext
SSD_16_MMX 16, 8
SSD_16_MMX 16, 16
INIT_XMM
SSD_ONE 8, 4, sse2
SSD_ONE 8, 8, sse2
SSD_ONE 8, 16, sse2
SSD_ONE 16, 8, sse2
SSD_ONE 16, 16, sse2
%endif ; X264_HIGH_BIT_DEPTH
%ifndef X264_HIGH_BIT_DEPTH
%macro SSD_LOAD_FULL 5
mova m1, [t0+%1]
mova m2, [t2+%2]
......@@ -310,9 +370,89 @@ INIT_MMX
SSD 4, 4, ssse3
SSD 4, 8, ssse3
%assign function_align 16
%endif ; !X264_HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void pixel_ssd_nv12_core( uint16_t *pixuv1, int stride1, uint16_t *pixuv2, int stride2,
; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
;
; The maximum width this function can handle without risk of overflow is given
; in the following equation:
;
; 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
;
; For 10-bit MMX this means width >= 16416 and for XMM >= 32832. At sane
; distortion levels it will take much more than that though.
;-----------------------------------------------------------------------------
%ifdef X264_HIGH_BIT_DEPTH
%macro SSD_NV12 1-2 0
cglobal pixel_ssd_nv12_core_%1, 6,7,7*(mmsize/16)
shl r4d, 2
FIX_STRIDES r1, r3
add r0, r4
add r2, r4
xor r6, r6
pxor m4, m4
pxor m5, m5
mova m6, [sq_0f]
.loopy:
mov r6, r4
neg r6
pxor m2, m2
pxor m3, m3
.loopx:
mova m0, [r0+r6]
mova m1, [r0+r6+mmsize]
psubw m0, [r2+r6]
psubw m1, [r2+r6+mmsize]
%if mmsize == 8
pshufw m0, m0, 11011000b
pshufw m1, m1, 11011000b
%else
pshuflw m0, m0, 11011000b
pshuflw m1, m1, 11011000b
pshufhw m0, m0, 11011000b
pshufhw m1, m1, 11011000b
%endif
pmaddwd m0, m0
pmaddwd m1, m1
paddd m2, m0
paddd m3, m1
add r6, 2*mmsize
jl .loopx
%if mmsize == 8
SBUTTERFLY dq, 2, 3, 1
%else
mova m1, m2
shufps m2, m3, 10001000b
shufps m3, m1, 11011101b
%endif
HADDD m2, m1
HADDD m3, m1
pand m2, m6
pand m3, m6
paddq m4, m2
paddq m5, m3
add r0, r1
add r2, r3
dec r5d
jg .loopy
mov r3, r6m
mov r4, r7m
movq [r3], m4
movq [r4], m5
RET
%endmacro ; SSD_NV12
%endif ; X264_HIGH_BIT_DEPTH
%ifndef X264_HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; uint64_t pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2, int width, int height )
; void pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2,
; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
;
; This implementation can potentially overflow on image widths >= 11008 (or
; 6604 if interlaced), since it is called on blocks of height up to 12 (resp
; 20). At sane distortion levels it will take much more than that though.
;-----------------------------------------------------------------------------
%macro SSD_NV12 1-2 0
cglobal pixel_ssd_nv12_core_%1, 6,7
......@@ -346,7 +486,7 @@ cglobal pixel_ssd_nv12_core_%1, 6,7
jg .loopy
mov r3, r6m
mov r4, r7m
mova m5, [pq_0f]
mova m5, [sq_0f]
HADDD m3, m0
HADDD m4, m0
pand m3, m5
......@@ -355,6 +495,7 @@ cglobal pixel_ssd_nv12_core_%1, 6,7
movq [r4], m4
RET
%endmacro ; SSD_NV12
%endif ; !X264_HIGH_BIT_DEPTH
INIT_MMX
SSD_NV12 mmxext
......@@ -368,15 +509,25 @@ SSD_NV12 sse2
%macro VAR_START 1
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
%ifndef X264_HIGH_BIT_DEPTH
%if %1
mova m7, [pw_00ff]
%else
pxor m7, m7 ; zero
%endif
%endif ; !X264_HIGH_BIT_DEPTH
%endmacro
%macro VAR_END 0
HADDW m5, m7
%macro VAR_END 2
%ifdef X264_HIGH_BIT_DEPTH
%if mmsize == 8 && %1*%2 == 256
HADDUW m5, m2
%else
HADDW m5, m2
%endif
%else ; !X264_HIGH_BIT_DEPTH
HADDW m5, m2
%endif ; X264_HIGH_BIT_DEPTH
movd eax, m5
HADDD m6, m1
movd edx, m6
......@@ -405,19 +556,28 @@ SSD_NV12 sse2
%macro VAR_2ROW 2
mov r2d, %2
.loop:
%ifdef X264_HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+mmsize]
mova m3, [r0+%1]
mova m4, [r0+%1+mmsize]
%else ; !X264_HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, m0
mova m3, [r0+%1]
mova m4, m3
punpcklbw m0, m7
punpckhbw m1, m7
%endif ; X264_HIGH_BIT_DEPTH
%ifidn %1, r1
lea r0, [r0+%1*2]
%else
add r0, r1
%endif
%ifndef X264_HIGH_BIT_DEPTH
punpcklbw m3, m7
punpckhbw m4, m7
%endif ; !X264_HIGH_BIT_DEPTH
dec r2d
VAR_CORE
jg .loop
......@@ -428,16 +588,43 @@ SSD_NV12 sse2
;-----------------------------------------------------------------------------
INIT_MMX
cglobal pixel_var_16x16_mmxext, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW 8, 16
VAR_END
VAR_2ROW 8*SIZEOF_PIXEL, 16
VAR_END 16, 16
cglobal pixel_var_8x8_mmxext, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 4
VAR_END
VAR_END 8, 8
INIT_XMM
%ifdef X264_HIGH_BIT_DEPTH
cglobal pixel_var_16x16_sse2, 2,3,8
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 8
VAR_END 16, 16
cglobal pixel_var_8x8_sse2, 2,3,8
lea r2, [r1*3]
VAR_START 0
mova m0, [r0]
mova m1, [r0+r1*2]
mova m3, [r0+r1*4]
mova m4, [r0+r2*2]
lea r0, [r0+r1*8]
VAR_CORE
mova m0, [r0]
mova m1, [r0+r1*2]
mova m3, [r0+r1*4]
mova m4, [r0+r2*2]
VAR_CORE
VAR_END 8, 8
%endif ; X264_HIGH_BIT_DEPTH
%ifndef X264_HIGH_BIT_DEPTH
cglobal pixel_var_16x16_sse2, 2,3,8
VAR_START 1
mov r2d, 8
......@@ -449,7 +636,7 @@ cglobal pixel_var_16x16_sse2, 2,3,8
VAR_CORE
dec r2d
jg .loop
VAR_END
VAR_END 16, 16
cglobal pixel_var_8x8_sse2, 2,4,8
VAR_START 1
......@@ -465,7 +652,8 @@ cglobal pixel_var_8x8_sse2, 2,4,8
VAR_CORE
dec r2d
jg .loop
VAR_END
VAR_END 8, 8
%endif ; !X264_HIGH_BIT_DEPTH
%macro VAR2_END 0
HADDW m5, m7
......@@ -480,17 +668,22 @@ cglobal pixel_var_8x8_sse2, 2,4,8
%endmacro
;-----------------------------------------------------------------------------
; int pixel_var2_8x8( uint8_t *, int, uint8_t *, int, int * )
; int pixel_var2_8x8( pixel *, int, pixel *, int, int * )
;-----------------------------------------------------------------------------
%ifndef ARCH_X86_64
INIT_MMX
cglobal pixel_var2_8x8_mmxext, 5,6
FIX_STRIDES r1, r3
VAR_START 0
mov r5d, 8
.loop:
%ifdef X264_HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+mmsize]
psubw m0, [r2]
psubw m1, [r2+mmsize]
%else ; !X264_HIGH_BIT_DEPTH
movq m0, [r0]
movq m1, m0
movq m4, m0
movq m2, [r2]
movq m3, m2
punpcklbw m0, m7
......@@ -499,6 +692,7 @@ cglobal pixel_var2_8x8_mmxext, 5,6
punpckhbw m3, m7
psubw m0, m2
psubw m1, m3
%endif ; X264_HIGH_BIT_DEPTH
paddw m5, m0
paddw m5, m1
pmaddwd m0, m0
......@@ -511,18 +705,24 @@ cglobal pixel_var2_8x8_mmxext, 5,6
jg .loop
VAR2_END
RET
%endif
INIT_XMM
cglobal pixel_var2_8x8_sse2, 5,6,8
VAR_START 1
mov r5d, 4
.loop:
%ifdef X264_HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+r1*2]
mova m2, [r2]
mova m3, [r2+r3*2]
%else ; !X264_HIGH_BIT_DEPTH
movq m1, [r0]
movhps m1, [r0+r1]
movq m3, [r2]
movhps m3, [r2+r3]
DEINTB 0, 1, 2, 3, 7
%endif ; X264_HIGH_BIT_DEPTH
psubw m0, m2
psubw m1, m3
paddw m5, m0
......@@ -531,13 +731,14 @@ cglobal pixel_var2_8x8_sse2, 5,6,8
pmaddwd m1, m1
paddd m6, m0
paddd m6, m1
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
lea r0, [r0+r1*2*SIZEOF_PIXEL]
lea r2, [r2+r3*2*SIZEOF_PIXEL]
dec r5d
jg .loop
VAR2_END
RET
%ifndef X264_HIGH_BIT_DEPTH
cglobal pixel_var2_8x8_ssse3, 5,6,8
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
......@@ -580,6 +781,7 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8
jg .loop
VAR2_END
RET
%endif ; !X264_HIGH_BIT_DEPTH
;=============================================================================
; SATD
......@@ -697,10 +899,11 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8
; out: %1 = satd
%macro SATD_4x4_MMX 3
%xdefine %%n n%1
LOAD_DIFF m4, m3, none, [r0+%2], [r2+%2]
LOAD_DIFF m5, m3, none, [r0+r1+%2], [r2+r3+%2]
LOAD_DIFF m6, m3, none, [r0+2*r1+%2], [r2+2*r3+%2]
LOAD_DIFF m7, m3, none, [r0+r4+%2], [r2+r5+%2]
%assign offset %2*SIZEOF_PIXEL
LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset]
LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset]
LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
LOAD_DIFF m7, m3, none, [r0+ r4+offset], [r2+ r5+offset]
%if %3
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
......@@ -733,17 +936,23 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8
%endmacro
%macro SATD_START_MMX 0
FIX_STRIDES r1, r3
lea r4, [3*r1] ; 3*stride1
lea r5, [3*r3] ; 3*stride2
%endmacro
%macro SATD_END_MMX 0
%ifdef X264_HIGH_BIT_DEPTH
HADDUW m0, m1
movd eax, m0
%else ; !X264_HIGH_BIT_DEPTH
pshufw m1, m0, 01001110b
paddw m0, m1
pshufw m1, m0, 10110001b
paddw m0, m1
movd eax, m0
and eax, 0xffff
%endif ; X264_HIGH_BIT_DEPTH
RET
%endmacro
......@@ -777,6 +986,35 @@ pixel_satd_8x4_internal_mmxext:
paddw m0, m1
ret
%ifdef X264_HIGH_BIT_DEPTH
%macro SATD_MxN_MMX 3
cglobal pixel_satd_%1x%2_mmxext, 4,7
SATD_START_MMX
pxor m0, m0
call pixel_satd_%1x%3_internal_mmxext
HADDUW m0, m1
movd r6d, m0
%rep %2/%3-1
pxor m0, m0
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
call pixel_satd_%1x%3_internal_mmxext
movd m2, r4
HADDUW m0, m1
movd r4, m0
add r6, r4
movd r4, m2
%endrep
movifnidn eax, r6d
RET
%endmacro
SATD_MxN_MMX 16, 16, 4
SATD_MxN_MMX 16, 8, 4
SATD_MxN_MMX 8, 16, 8
%endif ; X264_HIGH_BIT_DEPTH
%ifndef X264_HIGH_BIT_DEPTH
cglobal pixel_satd_16x16_mmxext, 4,6
SATD_START_MMX