Commit 03c61538, authored by Oskar Arvidsson, committed by Fiona Glaser
Browse files

x86 asm for high-bit-depth DCT

Only the forward DCT is done so far, and only with MMX; the iDCT still needs asm as well.
The new asm is ~4.4x faster than the C version.
parent 515d560f
......@@ -421,7 +421,16 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->dct4x4dc = dct4x4dc;
dctf->idct4x4dc = idct4x4dc;
#if !X264_HIGH_BIT_DEPTH
#if X264_HIGH_BIT_DEPTH
#if HAVE_MMX
if( cpu&X264_CPU_MMX )
{
dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
}
#endif // HAVE_MMX
#else // !X264_HIGH_BIT_DEPTH
#if HAVE_MMX
if( cpu&X264_CPU_MMX )
{
......@@ -519,7 +528,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add16x16_idct8= x264_add16x16_idct8_neon;
}
#endif
#endif // !X264_HIGH_BIT_DEPTH
#endif // X264_HIGH_BIT_DEPTH
}
void x264_dct_init_weights( void )
......
......@@ -114,6 +114,27 @@ cglobal idct4x4dc_mmx, 1,1
movq [r0+24], m3
RET
%ifdef X264_HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void sub4x4_dct( int32_t dct[4][4], uint16_t *pix1, uint16_t *pix2 )
;
; High-bit-depth 4x4 forward DCT of the residual pix1 - pix2.
;   r0 = dct  (output: 16 dword coefficients, 64 bytes)
;   r1 = pix1 (rows addressed with FENC_STRIDE)
;   r2 = pix2 (rows addressed with FDEC_STRIDE)
; Uses m0-m3 for the four residual rows and m4/m5 as scratch.
; NOTE(review): "3,3" is presumably x86inc's "3 arguments, 3 GPRs" - confirm
; against the cglobal definition, which is not visible here.
;-----------------------------------------------------------------------------
cglobal sub4x4_dct_mmx, 3,3
; Secondary entry point: SUB_NxN_DCT-generated functions reference
; sub4x4_dct_mmx.skip_prologue so they can reach this body directly.
.skip_prologue:
; Each LOAD_DIFF with "none" loads one row of pix1 and subtracts the matching
; row of pix2 as packed words (mova + psubw in the high-bit-depth branch).
; The m4 argument is not referenced by that branch as far as it is visible.
; Row offsets are 0/2/4/6 times the stride constant - presumably because the
; constants count pixels and each pixel is 2 bytes here; TODO confirm against
; the FENC_STRIDE/FDEC_STRIDE definitions.
LOAD_DIFF m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
LOAD_DIFF m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
LOAD_DIFF m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
; 1-D transform, transpose, 1-D transform again. Both macros are defined
; outside this view; m4 is passed as their scratch register.
DCT4_1D 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
DCT4_1D 0,1,2,3,4
; Sign-extend each row of four word coefficients to dwords and write it as two
; stores 8 bytes apart, 16 bytes per row (m4/m5 are clobbered as temporaries).
STORE_DIFF m0, m4, m5, [r0+ 0], [r0+ 8]
STORE_DIFF m1, m4, m5, [r0+16], [r0+24]
STORE_DIFF m2, m4, m5, [r0+32], [r0+40]
STORE_DIFF m3, m4, m5, [r0+48], [r0+56]
RET
%endif ; X264_HIGH_BIT_DEPTH
%ifndef X264_HIGH_BIT_DEPTH
%macro SUB_DCT4 1
;-----------------------------------------------------------------------------
; void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
......@@ -141,7 +162,9 @@ cglobal sub4x4_dct_%1, 3,3
SUB_DCT4 mmx
SUB_DCT4 ssse3
%endif ; !X264_HIGH_BIT_DEPTH
%ifndef X264_HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
......@@ -213,19 +236,22 @@ cglobal add4x4_idct_sse4, 2,2,6
movd [r0+FDEC_STRIDE*2], m0
pextrd [r0+FDEC_STRIDE*3], m0, 1
RET
%endif ; !X264_HIGH_BIT_DEPTH
INIT_MMX
;-----------------------------------------------------------------------------
; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
; (with X264_HIGH_BIT_DEPTH the coefficients are int32_t and the pixels uint16_t)
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 6
cglobal %1, 3,3,11
cglobal %1, 3,3,11*(mmsize/16)
%ifndef X264_HIGH_BIT_DEPTH
%if mmsize == 8
pxor m7, m7
%else
add r2, 4*FDEC_STRIDE
mova m7, [hsub_mul]
%endif
%endif ; !X264_HIGH_BIT_DEPTH
.skip_prologue:
%ifdef WIN64
sub rsp, 8
......@@ -255,7 +281,7 @@ cglobal %1, 3,3,11
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6-7
cglobal %1, 2,2,11
cglobal %1, 2,2,11*(mmsize/16)
pxor m7, m7
%if mmsize==16
add r0, 4*FDEC_STRIDE
......@@ -282,6 +308,11 @@ cglobal %1, 2,2,11
%endif
%endmacro
%ifdef X264_HIGH_BIT_DEPTH
INIT_MMX
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 64, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 64, 16, 8, 8
%else ; !X264_HIGH_BIT_DEPTH
%ifndef ARCH_X86_64
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
......@@ -310,6 +341,7 @@ ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
cextern sub8x8_dct8_ssse3.skip_prologue
SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
%endif ; X264_HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
......
......@@ -28,9 +28,9 @@
#ifndef X264_I386_DCT_H
#define X264_I386_DCT_H
void x264_sub4x4_dct_mmx ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_mmx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_mmx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub4x4_dct_mmx ( dctcoef dct [16], pixel *pix1, pixel *pix2 );
void x264_sub8x8_dct_mmx ( dctcoef dct[ 4][16], pixel *pix1, pixel *pix2 );
void x264_sub16x16_dct_mmx ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 );
void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
......
......@@ -471,7 +471,10 @@
%macro LOAD_DIFF 5
%ifidn %3, none
%ifdef X264_HIGH_BIT_DEPTH
mova %1, %4
psubw %1, %5
%elifidn %3, none
movh %1, %4
movh %2, %5
punpcklbw %1, %2
......@@ -557,6 +560,16 @@
packuswb %2, %1
%endmacro
%ifdef X264_HIGH_BIT_DEPTH
; STORE_DIFF src, tmp_lo, tmp_hi, dst_lo, dst_hi   (high-bit-depth variant)
; Widens the packed signed words in %1 to packed signed dwords and stores the
; low half to %4 and the high half to %5.
; Clobbers %2 and %3; %1 is left unchanged.
%macro STORE_DIFF 5
; Interleave so that each word of %1 lands in the upper 16 bits of a dword.
; The lower 16 bits are whatever the temporary held before and are discarded
; by the shift below, so the temporaries need no initialisation.
punpcklwd %2, %1
punpckhwd %3, %1
; Arithmetic shift right by 16: drops the stale low halves and sign-extends.
psrad %2, 16
psrad %3, 16
mova %4, %2
mova %5, %3
%endmacro
%else
%macro STORE_DIFF 4
movh %2, %4
punpcklbw %2, %3
......@@ -565,6 +578,7 @@
packuswb %1, %1
movh %4, %1
%endmacro
%endif
%macro CLIPW 3 ;(dst, min, max)
pmaxsw %1, %2
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment