Commit 7496fc4a authored by Aaron Schmitz, committed by Fiona Glaser

Some MBAFF x86 assembly functions.

deblock_chroma_420_mbaff, plus 422/422_intra_mbaff implemented using existing functions.
From Google Code-In.
parent b8d7b8ac
......@@ -167,10 +167,6 @@ static void deblock_h_chroma_mbaff_c( pixel *pix, int stride, int alpha, int bet
{
deblock_chroma_c( pix, 1, 2, stride, alpha, beta, tc0 );
}
/* C wrapper for the 4:2:2 MBAFF "h" chroma deblock entry point (per its name).
 * It only forwards to deblock_chroma_c with the constants (2, 2, stride); the
 * sibling deblock_h_chroma_mbaff_c passes (1, 2, stride), so the two differ
 * only in the second argument.
 * NOTE(review): the second argument is presumably a row/height count and the
 * next two are x/y strides -- confirm against deblock_chroma_c, whose
 * definition is not visible here. */
static void deblock_h_chroma_422_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
deblock_chroma_c( pix, 2, 2, stride, alpha, beta, tc0 );
}
static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
deblock_chroma_c( pix, 2, stride, 2, alpha, beta, tc0 );
......@@ -265,10 +261,6 @@ static void deblock_h_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, i
{
deblock_chroma_intra_c( pix, 2, 4, 2, stride, alpha, beta );
}
/* C wrapper for the 4:2:2 MBAFF intra "h" chroma deblock entry point (per its
 * name). Takes no tc0 argument, unlike the non-intra wrappers.
 * It only forwards to deblock_chroma_intra_c with the constants
 * (2, 8, 2, stride); the sibling deblock_h_chroma_intra_mbaff_c passes
 * (2, 4, 2, stride), so the two differ only in the third argument.
 * NOTE(review): the meaning of the numeric arguments (width/height/strides)
 * is presumed from the callee's name -- confirm against
 * deblock_chroma_intra_c, whose definition is not visible here. */
static void deblock_h_chroma_422_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta )
{
deblock_chroma_intra_c( pix, 2, 8, 2, stride, alpha, beta );
}
static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
deblock_chroma_intra_c( pix, 1, 16, stride, 2, alpha, beta );
......@@ -647,6 +639,8 @@ void x264_deblock_v_chroma_sse2( pixel *pix, int stride, int alpha, int beta, in
void x264_deblock_v_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mbaff_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mbaff_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
......@@ -660,6 +654,7 @@ void x264_deblock_h_chroma_intra_sse2( pixel *pix, int stride, int alpha, int be
void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
......@@ -672,15 +667,21 @@ void x264_deblock_strength_ssse3( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X2
void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mbaff_avx ( pixel *pix, int stride, int alpha, int beta );
#if ARCH_X86
void x264_deblock_h_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v8_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta );
#if HIGH_BIT_DEPTH
void x264_deblock_v_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
......@@ -727,10 +728,8 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_h_chroma_422_intra = deblock_h_chroma_422_intra_c;
pf->deblock_luma_mbaff = deblock_h_luma_mbaff_c;
pf->deblock_chroma_420_mbaff = deblock_h_chroma_mbaff_c;
pf->deblock_chroma_422_mbaff = deblock_h_chroma_422_mbaff_c;
pf->deblock_luma_intra_mbaff = deblock_h_luma_intra_mbaff_c;
pf->deblock_chroma_420_intra_mbaff = deblock_h_chroma_intra_mbaff_c;
pf->deblock_chroma_422_intra_mbaff = deblock_h_chroma_422_intra_mbaff_c;
pf->deblock_strength = deblock_strength_c;
#if HAVE_MMX
......@@ -741,14 +740,17 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_luma[0] = x264_deblock_h_luma_mmx2;
pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2;
#if !HIGH_BIT_DEPTH
pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_mmx2;
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_mmx2;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_mmx2;
#endif
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_mmx2;
pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2;
#endif
#if !HIGH_BIT_DEPTH
pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2;
#endif
pf->deblock_strength = x264_deblock_strength_mmx2;
if( cpu&X264_CPU_SSE2 )
......@@ -757,6 +759,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_sse2;
pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_sse2;
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
......@@ -766,6 +769,9 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2;
#if HIGH_BIT_DEPTH
pf->deblock_chroma_420_intra_mbaff= x264_deblock_h_chroma_intra_mbaff_sse2;
#endif
}
}
if( cpu&X264_CPU_SSSE3 )
......@@ -774,9 +780,8 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
{
pf->deblock_strength = x264_deblock_strength_avx;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
#if !HIGH_BIT_DEPTH
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx;
#endif
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_avx;
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->deblock_luma[1] = x264_deblock_v_luma_avx;
......@@ -786,6 +791,10 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx;
#if HIGH_BIT_DEPTH
pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_avx;
pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_avx;
#endif
}
}
}
......@@ -810,4 +819,8 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
}
#endif
#endif // !HIGH_BIT_DEPTH
/* These functions are equivalent, so don't duplicate them. */
pf->deblock_chroma_422_mbaff = pf->deblock_h_chroma_420;
pf->deblock_chroma_422_intra_mbaff = pf->deblock_h_chroma_420_intra;
}
......@@ -39,10 +39,10 @@ void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2
void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_ssse3( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] );
void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] );
......
......@@ -1675,7 +1675,6 @@ DEBLOCK_LUMA_INTRA v8
%macro DEBLOCK_CHROMA 0
cglobal deblock_inter_body
RESET_MM_PERMUTATION
LOAD_AB m4, m5, r2, r3
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
pxor m4, m4
......@@ -1726,7 +1725,6 @@ cglobal deblock_h_chroma, 5,7,8
cglobal deblock_intra_body
RESET_MM_PERMUTATION
LOAD_AB m4, m5, r2, r3
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
......@@ -1770,18 +1768,61 @@ cglobal deblock_h_chroma_intra, 4,6,8
dec r4
jg .loop
REP_RET
%endmacro
%ifndef ARCH_X86_64
INIT_MMX mmx2
DEBLOCK_CHROMA
;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra_mbaff( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_mbaff, 4,6,8
add r1, r1
%if mmsize == 8
mov r4, 16/mmsize
.loop:
%else
lea r5, [r1*3]
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
INIT_XMM avx
DEBLOCK_CHROMA
CHROMA_H_LOAD r5
LOAD_AB m4, m5, r2, r3
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
CHROMA_H_STORE r5
%if mmsize == 8
lea r0, [r0+r1*(mmsize/4)]
dec r4
jg .loop
%endif
REP_RET
%macro DEBLOCK_H_CHROMA_422_INTRA_10 0
;-----------------------------------------------------------------------------
; void deblock_h_chroma_mbaff( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_mbaff, 5,7,8
; Double the stride in r1. The prototype comment above declares pix as
; uint16_t*, so this presumably converts a stride in pixels to bytes
; -- TODO confirm against the callers.
add r1, r1
; r6 = 3*stride, handed to CHROMA_H_LOAD / CHROMA_H_STORE below.
lea r6, [r1*3]
%if mmsize == 8
; 8-byte (MMX) registers: run the body 16/mmsize = 2 times.
; Wider registers do a single pass with no loop.
mov r5, 16/mmsize
.loop:
%endif
CHROMA_H_LOAD r6
LOAD_AB m4, m5, r2, r3
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
; Expand tc0 for 16-bit lanes: load 4 bytes from [r4], duplicate each byte
; into a word, arithmetic-shift right by 8 so each word holds the
; sign-extended byte, then duplicate each word so every tc0 value fills two
; adjacent word lanes.
movd m6, [r4]
punpcklbw m6, m6
psraw m6, 8
punpcklwd m6, m6
; m7 &= expanded tc0. m7 is presumably the filter mask left by LOAD_MASK
; (macro defined outside this view).
pand m7, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
CHROMA_H_STORE r6
%if mmsize == 8
; MMX path only: advance pix by mmsize/4 = 2 strides and the tc0 pointer by
; mmsize/4 = 2 bytes, then repeat until the r5 counter reaches zero.
lea r0, [r0+r1*(mmsize/4)]
add r4, mmsize/4
dec r5
jg .loop
%endif
REP_RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma_422_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_422_intra, 4,6,8
add r1, r1
mov r4, 64/mmsize
......@@ -1796,19 +1837,17 @@ cglobal deblock_h_chroma_422_intra, 4,6,8
dec r4
jg .loop
REP_RET
%endmacro
INIT_XMM sse2
DEBLOCK_H_CHROMA_422_INTRA_10
%macro DEBLOCK_H_CHROMA_422_10 0
;-----------------------------------------------------------------------------
; void deblock_h_chroma_422( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_422, 5,7,8
add r1, r1
mov r5, 64/mmsize
lea r6, [r1*3]
.loop:
CHROMA_H_LOAD r6
RESET_MM_PERMUTATION
LOAD_AB m4, m5, r2, r3
LOAD_AB m4, m5, r2m, r3
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
pxor m4, m4
movd m6, [r4-1]
......@@ -1819,13 +1858,26 @@ cglobal deblock_h_chroma_422, 5,7,8
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
CHROMA_H_STORE r6
lea r0, [r0+r1*(mmsize/4)]
add r4, mmsize/16
%if mmsize == 16
inc r4
%else
mov r2, r5
and r2, 1
add r4, r2 ; increment once every 2 iterations
%endif
dec r5
jg .loop
REP_RET
%endmacro
%endmacro ; DEBLOCK_CHROMA
%ifndef ARCH_X86_64
INIT_MMX mmx2
DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_H_CHROMA_422_10
DEBLOCK_CHROMA
INIT_XMM avx
DEBLOCK_CHROMA
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
......@@ -1927,6 +1979,34 @@ INIT_MMX mmx2
DEBLOCK_CHROMA
%endif
;-----------------------------------------------------------------------------
; void deblock_h_chroma_mbaff( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
%macro DEBLOCK_H_CHROMA_420_MBAFF 0
; Emits deblock_h_chroma_mbaff for whichever instruction set is active when
; the macro is invoked (it is instantiated below for sse2, and for mmx2 on
; non-x86_64 builds). Straight-line code: one load/filter/store pass, no loop.
; t5 and t6 are register aliases defined outside this macro.
cglobal deblock_h_chroma_mbaff, 5,7,8
; Decrement r2d and r3d by one before they are passed to LOAD_MASK.
; Presumably these are the alpha and beta arguments from the prototype
; comment -- confirm against the register/argument mapping in use.
dec r2d
dec r3d
; Move the pixel pointer back 4 bytes, then set t5 = adjusted pix,
; t6 = 3*stride and r0 = adjusted pix + 3*stride. These are the row bases
; given to PASS8ROWS for both the load and the store.
sub r0, 4
lea t6, [r1*3]
mov t5, r0
add r0, t6
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
LOAD_MASK r2d, r3d
; Load 4 tc0 bytes and duplicate each byte into the adjacent lane, then AND
; the result into m7 (presumably the filter mask produced by LOAD_MASK).
movd m6, [r4] ; tc0
punpcklbw m6, m6
pand m7, m6
DEBLOCK_P0_Q0
; Store back through the same row bases; the trailing 2 is an extra argument
; to PASS8ROWS (presumably a byte offset -- confirm in its definition).
TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
RET
%endmacro
INIT_XMM sse2
DEBLOCK_H_CHROMA_420_MBAFF
%ifndef ARCH_X86_64
INIT_MMX mmx2
DEBLOCK_H_CHROMA_420_MBAFF
%endif
%macro DEBLOCK_H_CHROMA_422 0
cglobal deblock_h_chroma_422, 5,7,8
%ifdef ARCH_X86_64
......@@ -1978,7 +2058,7 @@ DEBLOCK_H_CHROMA_422
%define t5 r4
%define t6 r5
%macro DEBLOCK_CHROMA_INTRA 0
%macro DEBLOCK_CHROMA_INTRA_BODY 0
cglobal chroma_intra_body
LOAD_MASK r2d, r3d
mova m5, m1
......@@ -1992,7 +2072,9 @@ cglobal chroma_intra_body
paddb m1, m5
paddb m2, m6
ret
%endmacro
%macro DEBLOCK_CHROMA_INTRA 0
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
......@@ -2022,18 +2104,7 @@ cglobal deblock_h_chroma_intra, 4,6,8
TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
CHROMA_H_LOOP 0
RET
%endmacro ; DEBLOCK_CHROMA_INTRA
INIT_XMM sse2
DEBLOCK_CHROMA_INTRA
INIT_XMM avx
DEBLOCK_CHROMA_INTRA
%ifndef ARCH_X86_64
INIT_MMX mmx2
DEBLOCK_CHROMA_INTRA
%endif
%macro DEBLOCK_H_CHROMA_422_INTRA 0
cglobal deblock_h_chroma_422_intra, 4,7,8
CHROMA_H_START
mov r6d, 32/mmsize
......@@ -2046,13 +2117,30 @@ cglobal deblock_h_chroma_422_intra, 4,7,8
dec r6d
jg .loop
REP_RET
%endmacro
%endmacro ; DEBLOCK_CHROMA_INTRA
INIT_XMM sse2
DEBLOCK_H_CHROMA_422_INTRA
%ifndef ARCH_X86_64
DEBLOCK_CHROMA_INTRA_BODY
DEBLOCK_CHROMA_INTRA
INIT_XMM avx
DEBLOCK_CHROMA_INTRA_BODY
DEBLOCK_CHROMA_INTRA
INIT_MMX mmx2
DEBLOCK_H_CHROMA_422_INTRA
DEBLOCK_CHROMA_INTRA_BODY
%ifndef ARCH_X86_64
DEBLOCK_CHROMA_INTRA
%endif
;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra_mbaff( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
; Built once, for MMX registers only (INIT_MMX mmx2). This is in the
; !HIGH_BIT_DEPTH section, as the closing %endif just below indicates.
INIT_MMX mmx2
cglobal deblock_h_chroma_intra_mbaff, 4,6,8
; CHROMA_H_START (defined outside this view) presumably sets up t5/t6/r0 as
; the row bases that PASS8ROWS consumes below -- confirm in its definition.
CHROMA_H_START
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
; Shared intra filter body. It runs LOAD_MASK on r2d/r3d itself and returns
; with a plain ret. Single pass: unlike deblock_h_chroma_intra there is no
; CHROMA_H_LOOP here.
call chroma_intra_body
TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
RET
%endif ; !HIGH_BIT_DEPTH
......
......@@ -1423,8 +1423,8 @@ cglobal predict_8x8_vl, 2,2,8
mova [r0-2*FDEC_STRIDEB], m3
PALIGNR m3, m7, m6, SIZEOF_PIXEL*2, m5
mova [r0+0*FDEC_STRIDEB], m3
PALIGNR m3, m7, m6, SIZEOF_PIXEL*3, m5
mova [r0+2*FDEC_STRIDEB], m3
PALIGNR m7, m7, m6, SIZEOF_PIXEL*3, m5
mova [r0+2*FDEC_STRIDEB], m7
PALIGNR m3, m1, m0, SIZEOF_PIXEL*7, m6
PSLLPIX m5, m0, 1
PRED8x8_LOWPASS m0, m5, m2, m0, m7
......@@ -1435,8 +1435,8 @@ cglobal predict_8x8_vl, 2,2,8
mova [r0-1*FDEC_STRIDEB], m4
PALIGNR m4, m1, m0, SIZEOF_PIXEL*3, m2
mova [r0+1*FDEC_STRIDEB], m4
PALIGNR m4, m1, m0, SIZEOF_PIXEL*4, m2
mova [r0+3*FDEC_STRIDEB], m4
PALIGNR m1, m1, m0, SIZEOF_PIXEL*4, m2
mova [r0+3*FDEC_STRIDEB], m1
RET
%endmacro
%ifdef HIGH_BIT_DEPTH
......
......@@ -1565,11 +1565,15 @@ static int check_deblock( int cpu_ref, int cpu_new )
TEST_DEBLOCK( deblock_luma[1], 1, tcs[i] );
TEST_DEBLOCK( deblock_h_chroma_420, 0, tcs[i] );
TEST_DEBLOCK( deblock_h_chroma_422, 0, tcs[i] );
TEST_DEBLOCK( deblock_chroma_420_mbaff, 0, tcs[i] );
TEST_DEBLOCK( deblock_chroma_422_mbaff, 0, tcs[i] );
TEST_DEBLOCK( deblock_chroma[1], 1, tcs[i] );
TEST_DEBLOCK( deblock_luma_intra[0], 0 );
TEST_DEBLOCK( deblock_luma_intra[1], 1 );
TEST_DEBLOCK( deblock_h_chroma_420_intra, 0 );
TEST_DEBLOCK( deblock_h_chroma_422_intra, 0 );
TEST_DEBLOCK( deblock_chroma_420_intra_mbaff, 0 );
TEST_DEBLOCK( deblock_chroma_422_intra_mbaff, 0 );
TEST_DEBLOCK( deblock_chroma_intra[1], 1 );
if( db_a.deblock_strength != db_ref.deblock_strength )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment