Commit a35fd419 authored by Edward Wang, committed by Fiona Glaser

MMX/SSE2/AVX predict_8x16_p, high bit depth fdct8

From Google Code-In.
parent 9301bbd3
@@ -509,6 +509,8 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add4x4_idct = x264_add4x4_idct_sse2;
dctf->dct4x4dc = x264_dct4x4dc_sse2;
dctf->idct4x4dc = x264_idct4x4dc_sse2;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
dctf->add16x16_idct = x264_add16x16_idct_sse2;
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
@@ -516,11 +518,18 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2;
dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
}
if( cpu&X264_CPU_SSE4 )
{
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse4;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4;
}
if( cpu&X264_CPU_AVX )
{
dctf->add4x4_idct = x264_add4x4_idct_avx;
dctf->dct4x4dc = x264_dct4x4dc_avx;
dctf->idct4x4dc = x264_idct4x4dc_avx;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
dctf->add8x8_idct = x264_add8x8_idct_avx;
dctf->add16x16_idct = x264_add16x16_idct_avx;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx;
......
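As a reading aid, here is how a caller would pick up the function pointers that the dispatch code above installs. The wrapper function, its name, and the include are assumptions for illustration; only x264_dct_init and the sub8x8_dct8 member come from the hunk above.

#include "common/common.h"   /* assumed include; declares x264_dct_function_t, dctcoef, pixel */

/* Hypothetical helper: forward-transform one 8x8 residual block with whatever
 * SIMD version x264_dct_init selected for this CPU. */
static void transform_block_8x8( int cpu, dctcoef dct[64], pixel *pix1, pixel *pix2 )
{
    x264_dct_function_t dctf;
    x264_dct_init( cpu, &dctf );           /* fills in the sse2/sse4/avx pointers as available */
    dctf.sub8x8_dct8( dct, pix1, pix2 );   /* 8x8 DCT of pix1 - pix2 */
}

x264 itself fills this table once at encoder initialization and reuses it, rather than re-initializing per block as the sketch does.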
@@ -37,52 +37,131 @@ cextern pw_m2
cextern pw_32
cextern hsub_mul
%ifndef HIGH_BIT_DEPTH
%macro SPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets
%xdefine %%base %1
%rep %0/2
%xdefine %%tmp m%2
%rotate %0/2
mova [%%base + %2*16], %%tmp
%rotate 1-%0/2
%endrep
%endmacro
%macro UNSPILL_SHUFFLE 3-*
%xdefine %%base %1
%rep %0/2
%xdefine %%tmp m%2
%rotate %0/2
mova %%tmp, [%%base + %2*16]
%rotate 1-%0/2
%endrep
%endmacro
%macro SPILL 2+ ; assume offsets are the same as reg numbers
SPILL_SHUFFLE %1, %2, %2
%endmacro
%macro UNSPILL 2+
UNSPILL_SHUFFLE %1, %2, %2
%endmacro
; in: size, m0..m7, %10,%11,%12
; out: 0,4,6 in memory at %10,%11,%12, rest in regs
%macro DCT8_1D 12
SUMSUB_BA %1, %9, %2 ; %9 = s07, %2 = d07
SUMSUB_BA %1, %8, %3 ; %8 = s16, %3 = d16
SUMSUB_BA %1, %7, %4 ; %7 = s25, %4 = d25
SUMSUB_BA %1, %6, %5 ; %6 = s34, %5 = d34
SUMSUB_BA %1, %6, %9 ; %6 = a0, %9 = a2
SUMSUB_BA %1, %7, %8 ; %7 = a1, %8 = a3
SUMSUB_BA %1, %7, %6 ; %7 = dst0, %6 = dst4
mova [%10], m%7
mova [%11], m%6
psra%1 m%7, m%8, 1 ; a3>>1
padd%1 m%7, m%9 ; a2 + (a3>>1)
psra%1 m%9, 1 ; a2>>1
psub%1 m%9, m%8 ; (a2>>1) - a3
mova [%12], m%9
psra%1 m%6, m%4, 1
padd%1 m%6, m%4 ; d25+(d25>>1)
psub%1 m%8, m%2, m%5 ; a5 = d07-d34-(d25+(d25>>1))
psub%1 m%8, m%6
psra%1 m%6, m%3, 1
padd%1 m%6, m%3 ; d16+(d16>>1)
padd%1 m%9, m%2, m%5
psub%1 m%9, m%6 ; a6 = d07+d34-(d16+(d16>>1))
psra%1 m%6, m%2, 1
padd%1 m%6, m%2 ; d07+(d07>>1)
padd%1 m%6, m%3
padd%1 m%6, m%4 ; a4 = d16+d25+(d07+(d07>>1))
psra%1 m%2, m%5, 1
padd%1 m%2, m%5 ; d34+(d34>>1)
padd%1 m%2, m%3
psub%1 m%2, m%4 ; a7 = d16-d25+(d34+(d34>>1))
psra%1 m%5, m%2, 2
padd%1 m%5, m%6 ; a4 + (a7>>2)
psra%1 m%4, m%9, 2
padd%1 m%4, m%8 ; a5 + (a6>>2)
psra%1 m%6, 2
psra%1 m%8, 2
psub%1 m%6, m%2 ; (a4>>2) - a7
psub%1 m%9, m%8 ; a6 - (a5>>2)
SWAP %3, %5, %4, %7, %9, %6
%endmacro
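For readers tracing the shifts above, here is the same 1-D butterfly written out in scalar C. The s/d/a names mirror the macro's comments; dct8_1d_ref is a hypothetical name rather than an x264 function, and C's >> on a negative int is assumed to behave like the arithmetic psra the macro uses.

static void dct8_1d_ref( int d[8] )
{
    int s07 = d[0] + d[7], d07 = d[0] - d[7];
    int s16 = d[1] + d[6], d16 = d[1] - d[6];
    int s25 = d[2] + d[5], d25 = d[2] - d[5];
    int s34 = d[3] + d[4], d34 = d[3] - d[4];
    int a0 = s07 + s34, a2 = s07 - s34;
    int a1 = s16 + s25, a3 = s16 - s25;
    int a4 = d16 + d25 + (d07 + (d07 >> 1));
    int a5 = d07 - d34 - (d25 + (d25 >> 1));
    int a6 = d07 + d34 - (d16 + (d16 >> 1));
    int a7 = d16 - d25 + (d34 + (d34 >> 1));
    d[0] = a0 + a1;
    d[4] = a0 - a1;
    d[2] = a2 + (a3 >> 1);
    d[6] = (a2 >> 1) - a3;
    d[1] = a4 + (a7 >> 2);
    d[3] = a5 + (a6 >> 2);
    d[5] = a6 - (a5 >> 2);
    d[7] = (a4 >> 2) - a7;
}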
%ifdef HIGH_BIT_DEPTH
%macro SUB8x8_DCT8 0
cglobal sub8x8_dct8, 3,3,8
global current_function %+ .skip_prologue
.skip_prologue:
LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2
LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2
DCT8_1D w, 0,1,2,3,4,5,6,7, r0,r0+0x10,r0+0x50
mova m0, [r0]
mova [r0+0x30], m5
mova [r0+0x70], m7
TRANSPOSE4x4W 0,1,2,3,4
WIDEN_SXWD 0,4
WIDEN_SXWD 1,5
WIDEN_SXWD 2,6
WIDEN_SXWD 3,7
DCT8_1D d, 0,4,1,5,2,6,3,7, r0,r0+0x80,r0+0xC0
mova [r0+0x20], m4
mova [r0+0x40], m1
mova [r0+0x60], m5
mova [r0+0xA0], m6
mova [r0+0xE0], m7
mova m4, [r0+0x10]
mova m5, [r0+0x30]
mova m6, [r0+0x50]
mova m7, [r0+0x70]
; in: m0..m7
; out: 0,4,6 in mem, rest in regs
%macro DCT8_1D 9
SUMSUB_BA w, %8, %1 ; %8 = s07, %1 = d07
SUMSUB_BA w, %7, %2 ; %7 = s16, %2 = d16
SUMSUB_BA w, %6, %3 ; %6 = s25, %3 = d25
SUMSUB_BA w, %5, %4 ; %5 = s34, %4 = d34
SUMSUB_BA w, %5, %8 ; %5 = a0, %8 = a2
SUMSUB_BA w, %6, %7 ; %6 = a1, %7 = a3
SUMSUB_BA w, %6, %5 ; %6 = dst0, %5 = dst4
mova [%9+0x00], m%6
mova [%9+0x40], m%5
psraw m%6, m%7, 1 ; a3>>1
paddw m%6, m%8 ; a2 + (a3>>1)
psraw m%8, 1 ; a2>>1
psubw m%8, m%7 ; (a2>>1) - a3
mova [%9+0x60], m%8
psraw m%5, m%3, 1
paddw m%5, m%3 ; d25+(d25>>1)
psubw m%7, m%1, m%4 ; a5 = d07-d34-(d25+(d25>>1))
psubw m%7, m%5
psraw m%5, m%2, 1
paddw m%5, m%2 ; d16+(d16>>1)
paddw m%8, m%1, m%4
psubw m%8, m%5 ; a6 = d07+d34-(d16+(d16>>1))
psraw m%5, m%1, 1
paddw m%5, m%1 ; d07+(d07>>1)
paddw m%5, m%2
paddw m%5, m%3 ; a4 = d16+d25+(d07+(d07>>1))
psraw m%1, m%4, 1
paddw m%1, m%4 ; d34+(d34>>1)
paddw m%1, m%2
psubw m%1, m%3 ; a7 = d16-d25+(d34+(d34>>1))
psraw m%4, m%1, 2
paddw m%4, m%5 ; a4 + (a7>>2)
psraw m%3, m%8, 2
paddw m%3, m%7 ; a5 + (a6>>2)
psraw m%5, 2
psraw m%7, 2
psubw m%5, m%1 ; (a4>>2) - a7
psubw m%8, m%7 ; a6 - (a5>>2)
SWAP %2, %4, %3, %6, %8, %5
TRANSPOSE4x4W 4,5,6,7,0
WIDEN_SXWD 4,0
WIDEN_SXWD 5,1
WIDEN_SXWD 6,2
WIDEN_SXWD 7,3
DCT8_1D d,4,0,5,1,6,2,7,3, r0+0x10,r0+0x90,r0+0xD0
mova [r0+0x30], m0
mova [r0+0x50], m5
mova [r0+0x70], m1
mova [r0+0xB0], m2
mova [r0+0xF0], m3
ret
%endmacro
INIT_XMM sse2
SUB8x8_DCT8
INIT_XMM sse4
SUB8x8_DCT8
INIT_XMM avx
SUB8x8_DCT8
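In the high-bit-depth body above, the first DCT8_1D pass runs at word precision and WIDEN_SXWD (whose definition is not shown in this excerpt) then sign-extends the coefficients to dwords so the second pass can run at dword precision, matching the int32_t coefficient type used for high bit depth. In scalar terms the widening step amounts to the following (hypothetical helper name):

static void widen_sxwd_ref( const int16_t *src, int32_t *dst, int n )
{
    for( int i = 0; i < n; i++ )
        dst[i] = src[i];   /* sign extension from 16 to 32 bits */
}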
%else ; !HIGH_BIT_DEPTH
; in: 0,4 in mem, rest in regs
; out: m0..m7
%macro IDCT8_1D 9
@@ -145,38 +224,10 @@ load_diff_4x8_mmx:
ret
cglobal dct8_mmx
DCT8_1D 0,1,2,3,4,5,6,7,r0
DCT8_1D w,0,1,2,3,4,5,6,7,r0,r0+0x40,r0+0x60
SAVE_MM_PERMUTATION
ret
%macro SPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets
%xdefine %%base %1
%rep %0/2
%xdefine %%tmp m%2
%rotate %0/2
mova [%%base + %2*16], %%tmp
%rotate 1-%0/2
%endrep
%endmacro
%macro UNSPILL_SHUFFLE 3-*
%xdefine %%base %1
%rep %0/2
%xdefine %%tmp m%2
%rotate %0/2
mova %%tmp, [%%base + %2*16]
%rotate 1-%0/2
%endrep
%endmacro
%macro SPILL 2+ ; assume offsets are the same as reg numbers
SPILL_SHUFFLE %1, %2, %2
%endmacro
%macro UNSPILL 2+
UNSPILL_SHUFFLE %1, %2, %2
%endmacro
;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
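The function documented above follows the usual separable pattern: subtract the two 8x8 pixel blocks, run the 8-point transform along one dimension, transpose, run it along the other, and store. A rough scalar outline under those assumptions (hypothetical names; the SIMD code's actual coefficient memory layout is arranged for the later quant/zigzag steps and may differ):

#define FENC_STRIDE 16   /* assumed values of x264's fenc/fdec plane strides */
#define FDEC_STRIDE 32

static void dct8_1d_ref( int d[8] );   /* the scalar butterfly sketched earlier */

static void sub8x8_dct8_ref( int16_t dct[8][8], const uint8_t *pix1, const uint8_t *pix2 )
{
    int d[8][8], col[8];
    for( int y = 0; y < 8; y++ )
        for( int x = 0; x < 8; x++ )
            d[y][x] = pix1[y*FENC_STRIDE+x] - pix2[y*FDEC_STRIDE+x];
    for( int y = 0; y < 8; y++ )
        dct8_1d_ref( d[y] );                    /* horizontal pass */
    for( int x = 0; x < 8; x++ )
    {
        for( int y = 0; y < 8; y++ ) col[y] = d[y][x];
        dct8_1d_ref( col );                     /* vertical pass */
        for( int y = 0; y < 8; y++ ) dct[y][x] = col[y];
    }
}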
@@ -386,11 +437,11 @@ global current_function %+ .skip_prologue
LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
UNSPILL r0, 0
%endif
DCT8_1D 0,1,2,3,4,5,6,7,r0
DCT8_1D w,0,1,2,3,4,5,6,7,r0,r0+0x40,r0+0x60
UNSPILL r0, 0,4
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r0+0x60],[r0+0x40],1
UNSPILL r0, 4
DCT8_1D 0,1,2,3,4,5,6,7,r0
DCT8_1D w,0,1,2,3,4,5,6,7,r0,r0+0x40,r0+0x60
SPILL r0, 1,2,3,5,7
ret
%endmacro
......
@@ -36,55 +36,111 @@ cextern pw_m2
cextern pw_32
cextern hsub_mul
%ifndef HIGH_BIT_DEPTH
; in: size, m0..m7, temp, temp
; out: m0..m7
%macro DCT8_1D 11
SUMSUB_BA %1, %6, %5, %11 ; %6=s34, %5=d34
SUMSUB_BA %1, %7, %4, %11 ; %7=s25, %4=d25
SUMSUB_BA %1, %8, %3, %11 ; %8=s16, %3=d16
SUMSUB_BA %1, %9, %2, %11 ; %9=s07, %2=d07
%macro DCT8_1D 10
SUMSUB_BA w, %5, %4 ; %5=s34, %4=d34
SUMSUB_BA w, %6, %3 ; %6=s25, %3=d25
SUMSUB_BA w, %7, %2 ; %7=s16, %2=d16
SUMSUB_BA w, %8, %1 ; %8=s07, %1=d07
SUMSUB_BA %1, %7, %8, %11 ; %7=a1, %8=a3
SUMSUB_BA %1, %6, %9, %11 ; %6=a0, %9=a2
SUMSUB_BA w, %6, %7, %10 ; %6=a1, %7=a3
SUMSUB_BA w, %5, %8, %10 ; %5=a0, %8=a2
psra%1 m%10, m%2, 1
padd%1 m%10, m%2
padd%1 m%10, m%3
padd%1 m%10, m%4 ; %10=a4
psraw m%9, m%1, 1
paddw m%9, m%1
paddw m%9, m%2
paddw m%9, m%3 ; %9=a4
psraw m%10, m%4, 1
paddw m%10, m%4
paddw m%10, m%2
psubw m%10, m%3 ; %10=a7
SUMSUB_BA w, %4, %1
psubw m%1, m%3
psubw m%4, m%2
psraw m%3, 1
psraw m%2, 1
psubw m%1, m%3 ; %1=a5
psubw m%4, m%2 ; %4=a6
psraw m%2, m%10, 2
paddw m%2, m%9 ; %2=b1
psraw m%9, 2
psubw m%9, m%10 ; %9=b7
SUMSUB_BA w, %6, %5, %10 ; %6=b0, %5=b4
psraw m%3, m%7, 1
paddw m%3, m%8 ; %3=b2
psraw m%8, 1
psubw m%8, m%7 ; %8=b6
psra%1 m%11, m%5, 1
padd%1 m%11, m%5
padd%1 m%11, m%3
psub%1 m%11, m%4 ; %11=a7
SUMSUB_BA %1, %5, %2
psub%1 m%2, m%4
psub%1 m%5, m%3
psra%1 m%4, 1
psra%1 m%3, 1
psub%1 m%2, m%4 ; %2=a5
psub%1 m%5, m%3 ; %5=a6
psra%1 m%3, m%11, 2
padd%1 m%3, m%10 ; %3=b1
psra%1 m%10, 2
psub%1 m%10, m%11 ; %10=b7
SUMSUB_BA %1, %7, %6, %11 ; %7=b0, %6=b4
psra%1 m%4, m%8, 1
padd%1 m%4, m%9 ; %4=b2
psra%1 m%9, 1
psub%1 m%9, m%8 ; %9=b6
psraw m%7, m%4, 2
paddw m%7, m%1 ; %7=b3
psraw m%1, 2
psubw m%4, m%1 ; %4=b5
psra%1 m%8, m%5, 2
padd%1 m%8, m%2 ; %8=b3
psra%1 m%2, 2
psub%1 m%5, m%2 ; %5=b5
SWAP %1, %6, %4, %7, %8, %9
SWAP %2, %7, %5, %8, %9, %10
%endmacro
%ifdef HIGH_BIT_DEPTH
%macro SUB8x8_DCT8 0
cglobal sub8x8_dct8, 3,3,14
%ifdef WIN64
call .skip_prologue
RET
%endif
global current_function %+ .skip_prologue
.skip_prologue:
LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2
LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2
DCT8_1D w, 0,1,2,3,4,5,6,7, 8,9
TRANSPOSE4x4W 0,1,2,3,8
WIDEN_SXWD 0,8
WIDEN_SXWD 1,9
WIDEN_SXWD 2,10
WIDEN_SXWD 3,11
DCT8_1D d, 0,8,1,9,2,10,3,11, 12,13
mova [r0+0x00], m0
mova [r0+0x20], m8
mova [r0+0x40], m1
mova [r0+0x60], m9
mova [r0+0x80], m2
mova [r0+0xA0], m10
mova [r0+0xC0], m3
mova [r0+0xE0], m11
TRANSPOSE4x4W 4,5,6,7,0
WIDEN_SXWD 4,0
WIDEN_SXWD 5,1
WIDEN_SXWD 6,2
WIDEN_SXWD 7,3
DCT8_1D d,4,0,5,1,6,2,7,3, 8,9
mova [r0+0x10], m4
mova [r0+0x30], m0
mova [r0+0x50], m5
mova [r0+0x70], m1
mova [r0+0x90], m6
mova [r0+0xB0], m2
mova [r0+0xD0], m7
mova [r0+0xF0], m3
ret
%endmacro
INIT_XMM sse2
SUB8x8_DCT8
INIT_XMM sse4
SUB8x8_DCT8
INIT_XMM avx
SUB8x8_DCT8
%else ; !HIGH_BIT_DEPTH
%macro IDCT8_1D 10
SUMSUB_BA w, %5, %1, %9 ; %5=a0, %1=a2
@@ -136,7 +192,7 @@ cextern hsub_mul
%endmacro
%macro DCT_SUB8 0
cglobal sub8x8_dct, 3,3,11
cglobal sub8x8_dct, 3,3,10
add r2, 4*FDEC_STRIDE
%if cpuflag(ssse3)
mova m7, [hsub_mul]
@@ -177,9 +233,9 @@ global current_function %+ .skip_prologue
SWAP 7, 10
LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
LOAD_DIFF8x4 4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE
DCT8_1D 0,1,2,3,4,5,6,7,8,9
DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
DCT8_1D 0,1,2,3,4,5,6,7,8,9
DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
movdqa [r0+0x00], m0
movdqa [r0+0x10], m1
movdqa [r0+0x20], m2
......
@@ -355,8 +355,8 @@ INIT_MMX
;-----------------------------------------------------------------------------
; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 6
cglobal %1, 3,3,11
%macro SUB_NxN_DCT 7
cglobal %1, 3,3,%7
%ifndef HIGH_BIT_DEPTH
%if mmsize == 8
pxor m7, m7
@@ -427,24 +427,30 @@ cglobal %1, 2,2,11
%ifdef HIGH_BIT_DEPTH
INIT_MMX
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0, 0
SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8, 0
INIT_XMM
ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2, 64, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8
ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx, 64, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 64, 16, 8, 8
cextern sub8x8_dct8_sse2.skip_prologue
cextern sub8x8_dct8_sse4.skip_prologue
cextern sub8x8_dct8_avx.skip_prologue
SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 256, 16, 0, 0, 14
SUB_NxN_DCT sub16x16_dct8_sse4, sub8x8_dct8_sse4, 256, 16, 0, 0, 14
SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 256, 16, 0, 0, 14
%else ; !HIGH_BIT_DEPTH
%ifndef ARCH_X86_64
INIT_MMX
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 32, 4, 0, 0
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 32, 4, 0, 0, 0
ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx, 32, 4, 0, 0
SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 32, 8, 4, 4
SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 32, 8, 4, 4, 0
ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx, 32, 8, 4, 4
cextern sub8x8_dct8_mmx.skip_prologue
cextern add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx, 128, 8, 0, 0, 0
ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx, 128, 8, 0, 0
%endif
@@ -453,10 +459,10 @@ cextern sub8x8_dct_sse2.skip_prologue
cextern sub8x8_dct_ssse3.skip_prologue
cextern sub8x8_dct_avx.skip_prologue
cextern sub8x8_dct_xop.skip_prologue
SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_xop, sub8x8_dct_xop, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0, 10
SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0, 10
SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0, 10
SUB_NxN_DCT sub16x16_dct_xop, sub8x8_dct_xop, 128, 8, 0, 0, 10
cextern add8x8_idct_sse2.skip_prologue
cextern add8x8_idct_avx.skip_prologue
@@ -471,9 +477,9 @@ ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx, 128, 8, 0, 0
cextern sub8x8_dct8_sse2.skip_prologue
cextern sub8x8_dct8_ssse3.skip_prologue
cextern sub8x8_dct8_avx.skip_prologue
SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0, 11
SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11
SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0, 11
%endif ; HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
......
@@ -74,12 +74,14 @@ void x264_idct4x4dc_avx ( int32_t d[16] );
void x264_sub8x8_dct8_mmx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_mmx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_sse2 ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_sse2 ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_sse2 ( dctcoef dct [64], pixel *pix1, pixel *pix2 );
void x264_sub16x16_dct8_sse2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
void x264_sub8x8_dct8_ssse3 ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_ssse3( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_avx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_avx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_sse4 ( int32_t dct [64], uint16_t *pix1, uint16_t *pix2 );
void x264_sub16x16_dct8_sse4 ( int32_t dct[4][64], uint16_t *pix1, uint16_t *pix2 );
void x264_sub8x8_dct8_avx ( dctcoef dct [64], pixel *pix1, pixel *pix2 );
void x264_sub16x16_dct8_avx ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct [64] );
......
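The prototype changes above replace concrete int16_t/uint8_t types with x264's dctcoef and pixel typedefs, which follow the build's bit depth; roughly (a sketch of the common.h definitions from memory, not part of this diff):

#if HIGH_BIT_DEPTH
typedef uint16_t pixel;    /* 9/10-bit samples stored in 16 bits */
typedef int32_t  dctcoef;  /* high-bit-depth coefficients need more than 16 bits */
#else
typedef uint8_t  pixel;
typedef int16_t  dctcoef;
#endif

The sse4 versions added here are high-bit-depth-only, hence their explicit int32_t and uint16_t types.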
@@ -34,6 +34,7 @@ pw_76543210:
pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3: times 8 dw -3
pw_m7: times 8 dw -7
pb_00s_ff: times 8 db 0
pb_0s_ff: times 7 db 0
db 0xff
@@ -1079,36 +1080,42 @@ PREDICT_8x8_VR b
; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
%ifndef ARCH_X86_64
INIT_MMX
cglobal predict_8x8c_p_core_mmx2, 1,2
%ifndef HIGH_BIT_DEPTH
%macro PREDICT_CHROMA_P_MMX 1
cglobal predict_8x%1c_p_core, 1,2
LOAD_PLANE_ARGS
movq mm1, mm2
pmullw mm2, [pw_3210]
psllw mm1, 2
paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
mov r1d, 8
movq m1, m2
pmullw m2, [pw_3210]
psllw m1, 2
paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b}
paddsw m1, m0 ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b}
mov r1d, %1
ALIGN 4
.loop:
movq mm5, mm0
movq mm6, mm1
psraw mm5, 5
psraw mm6, 5
packuswb mm5, mm6
movq [r0], mm5
movq m5, m0
movq m6, m1
psraw m5, 5
psraw m6, 5
packuswb m5, m6
movq [r0], m5
paddsw mm0, mm4
paddsw mm1, mm4
paddsw m0, m4
paddsw m1, m4
add r0, FDEC_STRIDE
dec r1d
jg .loop
REP_RET
%endmacro ; PREDICT_CHROMA_P_MMX
INIT_MMX mmx2
PREDICT_CHROMA_P_MMX 8
PREDICT_CHROMA_P_MMX 16
%endif ; !HIGH_BIT_DEPTH
%endif ; !ARCH_X86_64
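The loop above implements the H.264 chroma plane-prediction core for an 8-pixel-wide block of %1 rows (8 or 16): ignoring the intermediate 16-bit saturation of paddsw, each output pixel is (i00 + b*x + c*y) >> 5 clamped to 0..255, with i00 carrying the plane's constant and rounding terms. A scalar sketch with a hypothetical name:

#include <stdint.h>

#define FDEC_STRIDE 32   /* assumed value of x264's decoded-frame stride */

static void predict_8xNc_p_core_ref( uint8_t *src, int height, int i00, int b, int c )
{
    for( int y = 0; y < height; y++, src += FDEC_STRIDE )
        for( int x = 0; x < 8; x++ )
        {
            int v = ( i00 + b*x + c*y ) >> 5;
            src[x] = v < 0 ? 0 : v > 255 ? 255 : v;   /* the packuswb clamp */
        }
}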
%macro PREDICT_8x8C 0
%macro PREDICT_CHROMA_P_XMM 1
%ifdef HIGH_BIT_DEPTH
cglobal predict_8x8c_p_core, 1,1,7
cglobal predict_8x%1c_p_core, 1,2,7
movd m0, r1m
movd m2, r2m
movd m4, r3m
@@ -1118,9 +1125,13 @@ cglobal predict_8x8c_p_core, 1,1,7
SPLATW m2, m2, 0
SPLATW m4, m4, 0
pmullw m2, [pw_43210123] ; b<