Commit 8a6a062e authored by Ilia Valiakhmetov, committed by Fiona Glaser

High bit depth SSE2/AVX add8x8_idct8 and add16x16_idct8

From Google Code-In.
parent a35fd419
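For reference, the scalar behaviour these new HIGH_BIT_DEPTH kernels vectorize is an 8x8 inverse transform (two 1D passes of the H.264 butterflies) followed by adding the >>6-scaled residual onto the predicted pixels with clipping to the bit-depth maximum. The sketch below is illustrative only: the types, FDEC_STRIDE, PIXEL_MAX and the helper names (idct8_1d, clip_pixel, add8x8_idct8_c) mimic x264 conventions but are not the exact symbols from the tree; the authoritative version is the C reference in common/dct.c.

#include <stdint.h>

/* Illustrative HIGH_BIT_DEPTH types; in x264 these come from common/common.h. */
typedef uint16_t pixel;    /* 9/10-bit samples stored in 16 bits */
typedef int32_t  dctcoef;  /* 32-bit transform coefficients      */

#define FDEC_STRIDE 32             /* stride of the decoded-pixel buffer */
#define PIXEL_MAX   ((1<<10)-1)    /* e.g. 10-bit depth                  */

static pixel clip_pixel( int x )
{
    return x < 0 ? 0 : x > PIXEL_MAX ? PIXEL_MAX : (pixel)x;
}

/* One 1D pass of the H.264 8x8 inverse transform over eight coefficients
 * spaced 'stride' elements apart (stride 1 = a row, stride 8 = a column). */
static void idct8_1d( dctcoef *d, int stride )
{
#define SRC(i) d[(i)*stride]
    int a0 = SRC(0) + SRC(4);
    int a2 = SRC(0) - SRC(4);
    int a4 = (SRC(2)>>1) - SRC(6);
    int a6 = (SRC(6)>>1) + SRC(2);
    int b0 = a0 + a6, b2 = a2 + a4, b4 = a2 - a4, b6 = a0 - a6;
    int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);
    int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);
    int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);
    int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);
    int b1 = (a7>>2) + a1, b3 = a3 + (a5>>2);
    int b5 = (a3>>2) - a5, b7 = a7 - (a1>>2);
    SRC(0) = b0 + b7; SRC(7) = b0 - b7;
    SRC(1) = b2 + b5; SRC(6) = b2 - b5;
    SRC(2) = b4 + b3; SRC(5) = b4 - b3;
    SRC(3) = b6 + b1; SRC(4) = b6 - b1;
#undef SRC
}

/* Scalar equivalent of add8x8_idct8: inverse-transform a 64-coefficient
 * block and add it, rounded and right-shifted by 6, onto the prediction. */
void add8x8_idct8_c( pixel *dst, dctcoef dct[64] )
{
    dct[0] += 32;                    /* rounding for the >>6 at the end  */
    for( int i = 0; i < 8; i++ )
        idct8_1d( dct + i, 8 );      /* vertical 1D pass (columns)       */
    for( int i = 0; i < 8; i++ )
        idct8_1d( dct + 8*i, 1 );    /* horizontal 1D pass (rows)        */
    for( int y = 0; y < 8; y++ )
        for( int x = 0; x < 8; x++ )
            dst[y*FDEC_STRIDE + x] =
                clip_pixel( dst[y*FDEC_STRIDE + x] + (dct[y*8 + x] >> 6) );
}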
@@ -513,6 +513,8 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
dctf->add16x16_idct = x264_add16x16_idct_sse2;
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
dctf->add16x16_idct8 = x264_add16x16_idct8_sse2;
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2;
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2;
@@ -532,6 +534,8 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
dctf->add8x8_idct = x264_add8x8_idct_avx;
dctf->add16x16_idct = x264_add16x16_idct_avx;
dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx;
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx;
dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
......
@@ -32,6 +32,8 @@
SECTION .text
cextern pd_32
cextern pw_pixel_max
cextern pw_2
cextern pw_m2
cextern pw_32
@@ -65,7 +67,7 @@ cextern hsub_mul
UNSPILL_SHUFFLE %1, %2, %2
%endmacro
; in: size, m0..m7, %10,%11,%12
; in: size, m0..m7
; out: 0,4,6 in memory at %10,%11,%12, rest in regs
%macro DCT8_1D 12
SUMSUB_BA %1, %9, %2 ; %9 = s07, %2 = d07
@@ -75,13 +77,13 @@ cextern hsub_mul
SUMSUB_BA %1, %6, %9 ; %6 = a0, %9 = a2
SUMSUB_BA %1, %7, %8 ; %7 = a1, %8 = a3
SUMSUB_BA %1, %7, %6 ; %7 = dst0, %6 = dst4
mova [%10], m%7
mova [%11], m%6
mova %10, m%7
mova %11, m%6
psra%1 m%7, m%8, 1 ; a3>>1
padd%1 m%7, m%9 ; a2 + (a3>>1)
psra%1 m%9, 1 ; a2>>1
psub%1 m%9, m%8 ; (a2>>1) - a3
mova [%12], m%9
mova %12, m%9
psra%1 m%6, m%4, 1
padd%1 m%6, m%4 ; d25+(d25>>1)
psub%1 m%8, m%2, m%5 ; a5 = d07-d34-(d25+(d25>>1))
@@ -109,6 +111,52 @@ cextern hsub_mul
SWAP %3, %5, %4, %7, %9, %6
%endmacro
; in: size, m[1,2,3,5,6,7], 0,4 in mem at %10,%11
; out: m0..m7
%macro IDCT8_1D 11
psra%1 m%2, m%4, 1
psra%1 m%6, m%8, 1
psub%1 m%2, m%8
padd%1 m%6, m%4
psra%1 m%8, m%3, 1
padd%1 m%8, m%3
padd%1 m%8, m%5
padd%1 m%8, m%7
psra%1 m%4, m%7, 1
padd%1 m%4, m%7
padd%1 m%4, m%9
psub%1 m%4, m%3
psub%1 m%3, m%5
psub%1 m%7, m%5
padd%1 m%3, m%9
psub%1 m%7, m%9
psra%1 m%5, 1
psra%1 m%9, 1
psub%1 m%3, m%5
psub%1 m%7, m%9
psra%1 m%5, m%8, 2
psra%1 m%9, m%4, 2
padd%1 m%5, m%7
padd%1 m%9, m%3
psra%1 m%7, 2
psra%1 m%3, 2
psub%1 m%8, m%7
psub%1 m%3, m%4
mova m%4, %10
mova m%7, %11
SUMSUB_BA %1, %7, %4
SUMSUB_BA %1, %6, %7
SUMSUB_BA %1, %2, %4
SUMSUB_BA %1, %8, %6
SUMSUB_BA %1, %3, %2
SUMSUB_BA %1, %9, %4
SUMSUB_BA %1, %5, %7
SWAP %2, %4
SWAP %6, %8
SWAP %2, %6, %7
SWAP %4, %9, %8
%endmacro
%ifdef HIGH_BIT_DEPTH
%macro SUB8x8_DCT8 0
@@ -118,7 +166,7 @@ global current_function %+ .skip_prologue
LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2
LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2
DCT8_1D w, 0,1,2,3,4,5,6,7, r0,r0+0x10,r0+0x50
DCT8_1D w, 0,1,2,3,4,5,6,7, [r0],[r0+0x10],[r0+0x50]
mova m0, [r0]
mova [r0+0x30], m5
@@ -128,7 +176,7 @@ global current_function %+ .skip_prologue
WIDEN_SXWD 1,5
WIDEN_SXWD 2,6
WIDEN_SXWD 3,7
DCT8_1D d, 0,4,1,5,2,6,3,7, r0,r0+0x80,r0+0xC0
DCT8_1D d, 0,4,1,5,2,6,3,7, [r0],[r0+0x80],[r0+0xC0]
mova [r0+0x20], m4
mova [r0+0x40], m1
mova [r0+0x60], m5
@@ -144,14 +192,14 @@ global current_function %+ .skip_prologue
WIDEN_SXWD 5,1
WIDEN_SXWD 6,2
WIDEN_SXWD 7,3
DCT8_1D d,4,0,5,1,6,2,7,3, r0+0x10,r0+0x90,r0+0xD0
DCT8_1D d,4,0,5,1,6,2,7,3, [r0+0x10],[r0+0x90],[r0+0xD0]
mova [r0+0x30], m0
mova [r0+0x50], m5
mova [r0+0x70], m1
mova [r0+0xB0], m2
mova [r0+0xF0], m3
ret
%endmacro
%endmacro ; SUB8x8_DCT8
INIT_XMM sse2
SUB8x8_DCT8
@@ -160,53 +208,65 @@ SUB8x8_DCT8
INIT_XMM avx
SUB8x8_DCT8
%else ; !HIGH_BIT_DEPTH
%macro ADD8x8_IDCT8 0
cglobal add8x8_idct8, 2,2
add r1, 128
global current_function %+ .skip_prologue
.skip_prologue:
UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, -6,-4,-2,2,4,6
IDCT8_1D d,0,1,2,3,4,5,6,7,[r1-128],[r1+0]
mova [r1+0], m4
TRANSPOSE4x4D 0,1,2,3,4
paddd m0, [pd_32]
mova m4, [r1+0]
SPILL_SHUFFLE r1, 0,1,2,3, -8,-6,-4,-2
TRANSPOSE4x4D 4,5,6,7,3
paddd m4, [pd_32]
SPILL_SHUFFLE r1, 4,5,6,7, 0,2,4,6
UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, -5,-3,-1,3,5,7
IDCT8_1D d,0,1,2,3,4,5,6,7,[r1-112],[r1+16]
mova [r1+16], m4
TRANSPOSE4x4D 0,1,2,3,4
mova m4, [r1+16]
mova [r1-112], m0
TRANSPOSE4x4D 4,5,6,7,0
SPILL_SHUFFLE r1, 4,5,6,7, 1,3,5,7
UNSPILL_SHUFFLE r1, 5,6,7, -6,-4,-2
IDCT8_1D d,4,5,6,7,0,1,2,3,[r1-128],[r1-112]
SPILL_SHUFFLE r1, 4,5,6,7,0,1,2,3, -8,-7,-6,-5,-4,-3,-2,-1
UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, 2,4,6,3,5,7
IDCT8_1D d,0,1,2,3,4,5,6,7,[r1+0],[r1+16]
SPILL_SHUFFLE r1, 7,6,5, 7,6,5
mova m7, [pw_pixel_max]
pxor m6, m6
mova m5, [r1-128]
STORE_DIFF m5, m0, m6, m7, [r0+0*FDEC_STRIDEB]
mova m0, [r1-112]
STORE_DIFF m0, m1, m6, m7, [r0+1*FDEC_STRIDEB]
mova m0, [r1-96]
STORE_DIFF m0, m2, m6, m7, [r0+2*FDEC_STRIDEB]
mova m0, [r1-80]
STORE_DIFF m0, m3, m6, m7, [r0+3*FDEC_STRIDEB]
mova m0, [r1-64]
STORE_DIFF m0, m4, m6, m7, [r0+4*FDEC_STRIDEB]
mova m0, [r1-48]
mova m1, [r1+80]
STORE_DIFF m0, m1, m6, m7, [r0+5*FDEC_STRIDEB]
mova m0, [r1-32]
mova m1, [r1+96]
STORE_DIFF m0, m1, m6, m7, [r0+6*FDEC_STRIDEB]
mova m0, [r1-16]
mova m1, [r1+112]
STORE_DIFF m0, m1, m6, m7, [r0+7*FDEC_STRIDEB]
RET
%endmacro ; ADD8x8_IDCT8
; in: 0,4 in mem, rest in regs
; out: m0..m7
%macro IDCT8_1D 9
psraw m%1, m%3, 1
psraw m%5, m%7, 1
psubw m%1, m%7
paddw m%5, m%3
psraw m%7, m%2, 1
paddw m%7, m%2
paddw m%7, m%4
paddw m%7, m%6
psraw m%3, m%6, 1
paddw m%3, m%6
paddw m%3, m%8
psubw m%3, m%2
psubw m%2, m%4
psubw m%6, m%4
paddw m%2, m%8
psubw m%6, m%8
psraw m%4, 1
psraw m%8, 1
psubw m%2, m%4
psubw m%6, m%8
psraw m%4, m%7, 2
psraw m%8, m%3, 2
paddw m%4, m%6
paddw m%8, m%2
psraw m%6, 2
psraw m%2, 2
psubw m%7, m%6
psubw m%2, m%3
mova m%3, [%9+0x00]
mova m%6, [%9+0x40]
SUMSUB_BA w, %6, %3
SUMSUB_BA w, %5, %6
SUMSUB_BA w, %1, %3
SUMSUB_BA w, %7, %5
SUMSUB_BA w, %2, %1
SUMSUB_BA w, %8, %3
SUMSUB_BA w, %4, %6
SWAP %1, %3
SWAP %5, %7
SWAP %1, %5, %6
SWAP %3, %8, %7
%endmacro
INIT_XMM sse2
ADD8x8_IDCT8
INIT_XMM avx
ADD8x8_IDCT8
%else ; !HIGH_BIT_DEPTH
INIT_MMX
ALIGN 16
@@ -224,7 +284,7 @@ load_diff_4x8_mmx:
ret
cglobal dct8_mmx
DCT8_1D w,0,1,2,3,4,5,6,7,r0,r0+0x40,r0+0x60
DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60]
SAVE_MM_PERMUTATION
ret
@@ -277,7 +337,7 @@ global sub8x8_dct8_mmx.skip_prologue
ret
cglobal idct8_mmx
IDCT8_1D 0,1,2,3,4,5,6,7,r1
IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64]
SAVE_MM_PERMUTATION
ret
@@ -437,11 +497,11 @@ global current_function %+ .skip_prologue
LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
UNSPILL r0, 0
%endif
DCT8_1D w,0,1,2,3,4,5,6,7,r0,r0+0x40,r0+0x60
DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60]
UNSPILL r0, 0,4
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r0+0x60],[r0+0x40],1
UNSPILL r0, 4
DCT8_1D w,0,1,2,3,4,5,6,7,r0,r0+0x40,r0+0x60
DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60]
SPILL r0, 1,2,3,5,7
ret
%endmacro
@@ -512,12 +572,12 @@ cglobal add8x8_idct8, 2,2
global current_function %+ .skip_prologue
.skip_prologue:
UNSPILL r1, 1,2,3,5,6,7
IDCT8_1D 0,1,2,3,4,5,6,7,r1
IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64]
SPILL r1, 6
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r1+0x60],[r1+0x40],1
paddw m0, [pw_32]
SPILL r1, 0
IDCT8_1D 0,1,2,3,4,5,6,7,r1
IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64]
SPILL r1, 6,7
pxor m7, m7
DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
......
@@ -31,6 +31,8 @@
SECTION .text
cextern pd_32
cextern pw_pixel_max
cextern pw_2
cextern pw_m2
cextern pw_32
@@ -85,6 +87,56 @@ cextern hsub_mul
SWAP %2, %7, %5, %8, %9, %10
%endmacro
%macro IDCT8_1D 11
SUMSUB_BA %1, %6, %2, %10 ; %5=a0, %1=a2
psra%1 m%10, m%3, 1
padd%1 m%10, m%3
padd%1 m%10, m%5
padd%1 m%10, m%7 ; %9=a7
psra%1 m%11, m%4, 1
psub%1 m%11, m%8 ; %10=a4
psra%1 m%8, 1
padd%1 m%8, m%4 ; %7=a6
psra%1 m%4, m%7, 1
padd%1 m%4, m%7
padd%1 m%4, m%9
psub%1 m%4, m%3 ; %3=a5
psub%1 m%3, m%5
psub%1 m%7, m%5
padd%1 m%3, m%9
psub%1 m%7, m%9
psra%1 m%5, 1
psra%1 m%9, 1
psub%1 m%3, m%5 ; %2=a3
psub%1 m%7, m%9 ; %6=a1
psra%1 m%5, m%10, 2
padd%1 m%5, m%7 ; %4=b1
psra%1 m%7, 2
psub%1 m%10, m%7 ; %9=b7
SUMSUB_BA %1, %8, %6, %7 ; %7=b0, %5=b6
SUMSUB_BA %1, %11, %2, %7 ; %10=b2, %1=b4
psra%1 m%9, m%4, 2
padd%1 m%9, m%3 ; %8=b3
psra%1 m%3, 2
psub%1 m%3, m%4 ; %2=b5
SUMSUB_BA %1, %10, %8, %7 ; %9=c0, %7=c7
SUMSUB_BA %1, %3, %11, %7 ; %2=c1, %10=c6
SUMSUB_BA %1, %9, %2, %7 ; %8=c2, %1=c5
SUMSUB_BA %1, %5, %6, %7 ; %4=c3, %5=c4
SWAP %11, %4
SWAP %2, %10, %7
SWAP %4, %9, %8
%endmacro
%ifdef HIGH_BIT_DEPTH
%macro SUB8x8_DCT8 0
@@ -130,7 +182,7 @@ global current_function %+ .skip_prologue
mova [r0+0xD0], m7
mova [r0+0xF0], m3
ret
%endmacro
%endmacro ; SUB8x8_DCT8
INIT_XMM sse2
SUB8x8_DCT8
@@ -139,57 +191,68 @@ SUB8x8_DCT8
INIT_XMM avx
SUB8x8_DCT8
%else ; !HIGH_BIT_DEPTH
%macro ADD8x8_IDCT8 0
cglobal add8x8_idct8, 2,2,16
add r1, 128
%ifdef WIN64
call .skip_prologue
RET
%endif
global current_function %+ .skip_prologue
.skip_prologue:
mova m0, [r1-128]
mova m1, [r1-96]
mova m2, [r1-64]
mova m3, [r1-32]
mova m4, [r1+ 0]
mova m5, [r1+32]
mova m6, [r1+64]
mova m7, [r1+96]
IDCT8_1D d,0,1,2,3,4,5,6,7,8,9
TRANSPOSE4x4D 0,1,2,3,8
TRANSPOSE4x4D 4,5,6,7,8
paddd m0, [pd_32]
paddd m4, [pd_32]
mova [r1+64], m6
mova [r1+96], m7
mova m8, [r1-112]
mova m9, [r1-80]
mova m10, [r1-48]
mova m11, [r1-16]
mova m12, [r1+16]
mova m13, [r1+48]
mova m14, [r1+80]
mova m15, [r1+112]
IDCT8_1D d,8,9,10,11,12,13,14,15,6,7
TRANSPOSE4x4D 8,9,10,11,6
TRANSPOSE4x4D 12,13,14,15,6
IDCT8_1D d,0,1,2,3,8,9,10,11,6,7
mova [r1-112], m8
mova [r1-80], m9
mova m6, [r1+64]
mova m7, [r1+96]
IDCT8_1D d,4,5,6,7,12,13,14,15,8,9
pxor m8, m8
mova m9, [pw_pixel_max]
STORE_DIFF m0, m4, m8, m9, [r0+0*FDEC_STRIDEB]
STORE_DIFF m1, m5, m8, m9, [r0+1*FDEC_STRIDEB]
STORE_DIFF m2, m6, m8, m9, [r0+2*FDEC_STRIDEB]
STORE_DIFF m3, m7, m8, m9, [r0+3*FDEC_STRIDEB]
mova m0, [r1-112]
mova m1, [r1-80]
STORE_DIFF m0, m12, m8, m9, [r0+4*FDEC_STRIDEB]
STORE_DIFF m1, m13, m8, m9, [r0+5*FDEC_STRIDEB]
STORE_DIFF m10, m14, m8, m9, [r0+6*FDEC_STRIDEB]
STORE_DIFF m11, m15, m8, m9, [r0+7*FDEC_STRIDEB]
ret
%endmacro ; ADD8x8_IDCT8
%macro IDCT8_1D 10
SUMSUB_BA w, %5, %1, %9 ; %5=a0, %1=a2
psraw m%9, m%2, 1
paddw m%9, m%2
paddw m%9, m%4
paddw m%9, m%6 ; %9=a7
psraw m%10, m%3, 1
psubw m%10, m%7 ; %10=a4
psraw m%7, 1
paddw m%7, m%3 ; %7=a6
psraw m%3, m%6, 1
paddw m%3, m%6
paddw m%3, m%8
psubw m%3, m%2 ; %3=a5
psubw m%2, m%4
psubw m%6, m%4
paddw m%2, m%8
psubw m%6, m%8
psraw m%4, 1
psraw m%8, 1
psubw m%2, m%4 ; %2=a3
psubw m%6, m%8 ; %6=a1
psraw m%4, m%9, 2
paddw m%4, m%6 ; %4=b1
psraw m%6, 2
psubw m%9, m%6 ; %9=b7
SUMSUB_BA w, %7, %5, %6 ; %7=b0, %5=b6
SUMSUB_BA w, %10, %1, %6 ; %10=b2, %1=b4
psraw m%8, m%3, 2
paddw m%8, m%2 ; %8=b3
psraw m%2, 2
psubw m%2, m%3 ; %2=b5
SUMSUB_BA w, %9, %7, %6 ; %9=c0, %7=c7
SUMSUB_BA w, %2, %10, %6 ; %2=c1, %10=c6
SUMSUB_BA w, %8, %1, %6 ; %8=c2, %1=c5
SUMSUB_BA w, %4, %5, %6 ; %4=c3, %5=c4
SWAP %10, %3
SWAP %1, %9, %6
SWAP %3, %8, %7
%endmacro
INIT_XMM sse2
ADD8x8_IDCT8
INIT_XMM avx
ADD8x8_IDCT8
%else ; !HIGH_BIT_DEPTH
%macro DCT_SUB8 0
cglobal sub8x8_dct, 3,3,10
@@ -282,10 +345,10 @@ global current_function %+ .skip_prologue
movdqa m5, [r1+0x50]
movdqa m6, [r1+0x60]
movdqa m7, [r1+0x70]
IDCT8_1D 0,1,2,3,4,5,6,7,8,10
IDCT8_1D w,0,1,2,3,4,5,6,7,8,10
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
paddw m0, [pw_32] ; rounding for the >>6 at the end
IDCT8_1D 0,1,2,3,4,5,6,7,8,10
IDCT8_1D w,0,1,2,3,4,5,6,7,8,10
DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
......
@@ -395,12 +395,15 @@ cglobal %1, 3,3,%7
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6-7
%ifdef HIGH_BIT_DEPTH
cglobal %1, 2,2,6
cglobal %1, 2,2,%7
%if %3==256
add r1, 128
%endif
%else
cglobal %1, 2,2,11
pxor m7, m7
%endif
%if mmsize==16
%if mmsize==16 && %3!=256
add r0, 4*FDEC_STRIDE
%endif
.skip_prologue:
@@ -430,10 +433,14 @@ INIT_MMX
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0, 0
SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8, 0
INIT_XMM
ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2, 64, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8
ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx, 64, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 64, 16, 8, 8
ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2, 64, 8, 0, 0, 6
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8, 6
ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx, 64, 8, 0, 0, 6
ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 64, 16, 8, 8, 6
cextern add8x8_idct8_sse2.skip_prologue
cextern add8x8_idct8_avx.skip_prologue
ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 256, 16, 0, 0, 16
ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx, 256, 16, 0, 0, 16
cextern sub8x8_dct8_sse2.skip_prologue
cextern sub8x8_dct8_sse4.skip_prologue
cextern sub8x8_dct8_avx.skip_prologue
......
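The ADD_NxN_IDCT changes above are what let the 16x16 high-bit-depth wrappers be stitched together from the 8x8 .skip_prologue entry points (block size 256, 16 xmm registers). Conceptually the wrapper just runs the 8x8 kernel once per sub-block; a hedged C equivalent, reusing the illustrative add8x8_idct8_c and FDEC_STRIDE from the sketch near the top of this page:

void add16x16_idct8_c( pixel *dst, dctcoef dct[4][64] )
{
    /* four 8x8 sub-blocks of the 16x16 partition, in raster order */
    add8x8_idct8_c( &dst[0*FDEC_STRIDE + 0], dct[0] );
    add8x8_idct8_c( &dst[0*FDEC_STRIDE + 8], dct[1] );
    add8x8_idct8_c( &dst[8*FDEC_STRIDE + 0], dct[2] );
    add8x8_idct8_c( &dst[8*FDEC_STRIDE + 8], dct[3] );
}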
@@ -86,10 +86,10 @@ void x264_sub16x16_dct8_avx ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct [64] );
void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][64] );
void x264_add8x8_idct8_sse2 ( uint8_t *dst, int16_t dct [64] );
void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][64] );
void x264_add8x8_idct8_avx ( uint8_t *dst, int16_t dct [64] );
void x264_add16x16_idct8_avx ( uint8_t *dst, int16_t dct[4][64] );
void x264_add8x8_idct8_sse2 ( pixel *dst, dctcoef dct [64] );
void x264_add16x16_idct8_sse2( pixel *dst, dctcoef dct[4][64] );
void x264_add8x8_idct8_avx ( pixel *dst, dctcoef dct [64] );
void x264_add16x16_idct8_avx ( pixel *dst, dctcoef dct[4][64] );
void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] );
......
@@ -765,13 +765,24 @@
packuswb %2, %1
%endmacro
%macro STORE_DIFF 4
; (high depth) in: %1, %2, min to clip, max to clip, mem128
; in: %1, tmp, %3, mem64
%macro STORE_DIFF 4-5
%ifdef HIGH_BIT_DEPTH
psrad %1, 6
psrad %2, 6
packssdw %1, %2
paddw %1, %5
CLIPW %1, %3, %4
mova %5, %1
%else
movh %2, %4
punpcklbw %2, %3
psraw %1, 6
paddsw %1, %2
packuswb %1, %1
movh %4, %1
%endif
%endmacro
%macro SHUFFLE_MASK_W 8
......
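The new HIGH_BIT_DEPTH branch of STORE_DIFF above scales two 4-dword coefficient registers down by 6 bits, packs them into eight words, adds the predicted pixels from memory and clips the result to [0, pw_pixel_max] before storing it back. Per pixel it amounts to roughly the following untested scalar paraphrase (names hypothetical):

#include <stdint.h>

/* coef: 32-bit transform coefficient; pred: predicted sample; pix_max: e.g. 1023 */
static inline uint16_t store_diff_px( int32_t coef, uint16_t pred, uint16_t pix_max )
{
    int v = pred + (coef >> 6);      /* psrad 6 (+ packssdw) + paddw */
    if( v < 0 )       v = 0;         /* CLIPW lower bound            */
    if( v > pix_max ) v = pix_max;   /* CLIPW upper bound            */
    return (uint16_t)v;
}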