Commit de7aed78 authored by Matt Habel, committed by Fiona Glaser

High bit depth intra_sad_x3_8x8, intra_satd_x3_4x4/8x8c/16x16

Also add an ACCUM macro to handle the add-or-swap accumulator pattern more concisely.
parent d9dee734
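The ACCUM macro added at the end of this commit abstracts a pattern that recurs throughout the asm below: on the first iteration of an unrolled loop the value should simply become the accumulator (done with a cost-free register SWAP rather than zeroing the accumulator up front), and on every later iteration it should be folded in with an add or an or. A rough C analogy of the pattern — illustrative only, not x264 code:

#include <stdint.h>

/* iteration 0: the value becomes the accumulator (the asm SWAPs register
 * names, which costs no instructions); iteration > 0: fold the value in */
static inline uint32_t accum_paddd( uint32_t acc, uint32_t val, int iter )
{
    return iter ? acc + val : val; /* the por variant would use acc | val */
}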
@@ -522,8 +522,6 @@ INTRA_MBCMP_8x8( sad,, _c )
INTRA_MBCMP_8x8(sa8d,, _c )
#if HIGH_BIT_DEPTH && HAVE_MMX
INTRA_MBCMP_8x8( sad, _mmx2, _c )
INTRA_MBCMP_8x8( sad, _sse2, _sse2 )
INTRA_MBCMP_8x8( sad, _ssse3, _sse2 )
INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 )
#endif
@@ -550,11 +548,8 @@ INTRA_MBCMP(satd, 16x16, v, h, dc, ,, _c )
#if HAVE_MMX
#if HIGH_BIT_DEPTH
INTRA_MBCMP( sad, 4x4, v, h, dc, , _mmx2, _c )
INTRA_MBCMP(satd, 4x4, v, h, dc, , _mmx2, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _mmx2, _c )
INTRA_MBCMP(satd, 8x8, dc, h, v, c, _mmx2, _c )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _mmx2, _mmx2 )
INTRA_MBCMP(satd, 16x16, v, h, dc, , _mmx2, _mmx2 )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _sse2, _sse2 )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _sse2, _sse2 )
INTRA_MBCMP( sad, 4x4, v, h, dc, , _ssse3, _c )
@@ -872,6 +867,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2;
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
{
@@ -1437,15 +1437,9 @@ ZIGZAG_SUB_4x4 ac, field
mova [r0+(%1+64)*SIZEOF_PIXEL], m2
mova [r0+(%1+96)*SIZEOF_PIXEL], m3
packsswb m0, m1
%if %1
por m6, m2
por m7, m3
por m5, m0
%else
SWAP 5, 0
SWAP 6, 2
SWAP 7, 3
%endif
ACCUM por, 6, 2, %1
ACCUM por, 7, 3, %1
ACCUM por, 5, 0, %1
%endmacro
%macro ZIGZAG_8x8_CAVLC 1
@@ -130,7 +130,7 @@ cextern pb_1
cextern pw_1
cextern pw_8
cextern pw_16
cextern pw_64
cextern pw_32
cextern pw_00ff
cextern pw_ppppmmmm
cextern pw_ppmmppmm
@@ -1794,6 +1794,12 @@ cglobal intra_sa8d_x3_8x8, 3,3,14
INIT_MMX
cglobal hadamard_load
; not really a global, but otherwise cycles get attributed to the wrong function in profiling
%ifdef HIGH_BIT_DEPTH
mova m0, [r0+0*FENC_STRIDEB]
mova m1, [r0+1*FENC_STRIDEB]
mova m2, [r0+2*FENC_STRIDEB]
mova m3, [r0+3*FENC_STRIDEB]
%else
pxor m7, m7
movd m0, [r0+0*FENC_STRIDE]
movd m1, [r0+1*FENC_STRIDE]
@@ -1803,24 +1809,31 @@ cglobal hadamard_load
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
%endif
HADAMARD4_2D 0, 1, 2, 3, 4
SAVE_MM_PERMUTATION
ret
%macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp
%ifidn %1, top
movd %3, [r1+%2-FDEC_STRIDE]
%ifdef HIGH_BIT_DEPTH
mova %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
%else
movd %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
pxor %5, %5
punpcklbw %3, %5
%endif
%else ; left
%ifnidn %2, 0
shl %2d, 5 ; log(FDEC_STRIDE)
shl %2d, 5 ; log(FDEC_STRIDEB)
%endif
movd %3, [r1+%2-4+1*FDEC_STRIDE]
pinsrw %3, [r1+%2-2+0*FDEC_STRIDE], 0
pinsrw %3, [r1+%2-2+2*FDEC_STRIDE], 2
pinsrw %3, [r1+%2-2+3*FDEC_STRIDE], 3
movd %3, [r1+%2*SIZEOF_PIXEL-4+1*FDEC_STRIDEB]
pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+0*FDEC_STRIDEB], 0
pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+2*FDEC_STRIDEB], 2
pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+3*FDEC_STRIDEB], 3
%ifndef HIGH_BIT_DEPTH
psrlw %3, 8
%endif
%ifnidn %2, 0
shr %2d, 5
%endif
@@ -1859,19 +1872,6 @@ cglobal hadamard_load
%8 %3, %6
%endmacro
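The SIZEOF_PIXEL scaling and the FDEC_STRIDE -> FDEC_STRIDEB substitutions above are the mechanical core of the high bit depth port: with 16-bit pixels, every byte offset doubles. A minimal sketch of the relationship, assuming x264's usual stride constants (the defines here are illustrative, not the project's actual headers):

#include <stdint.h>

#if HIGH_BIT_DEPTH
typedef uint16_t pixel; /* 9/10-bit samples stored in 16-bit words */
#define SIZEOF_PIXEL 2
#else
typedef uint8_t pixel;
#define SIZEOF_PIXEL 1
#endif

#define FENC_STRIDE 16 /* strides in pixels... */
#define FDEC_STRIDE 32
#define FENC_STRIDEB (FENC_STRIDE * SIZEOF_PIXEL) /* ...and in bytes */
#define FDEC_STRIDEB (FDEC_STRIDE * SIZEOF_PIXEL)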
%macro CLEAR_SUMS 0
%ifdef ARCH_X86_64
mov qword [sums+0], 0
mov qword [sums+8], 0
mov qword [sums+16], 0
%else
pxor m7, m7
movq [sums+0], m7
movq [sums+8], m7
movq [sums+16], m7
%endif
%endmacro
; in: m1..m3
; out: m7
; clobber: m4..m6
@@ -1954,33 +1954,43 @@ cglobal intra_satd_x3_4x4, 3,3
; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal intra_satd_x3_16x16, 0,5
%assign stack_pad 88 + ((stack_offset+88+gprsize)&15)
%assign stack_pad 120 + ((stack_offset+120+gprsize)&15)
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
SUB rsp, stack_pad
%define sums rsp+64 ; size 24
%define sums rsp+64 ; size 56
%define top_1d rsp+32 ; size 32
%define left_1d rsp ; size 32
movifnidn r1, r1mp
CLEAR_SUMS
pxor m7, m7
mova [sums+ 0], m7
mova [sums+ 8], m7
mova [sums+16], m7
%ifdef HIGH_BIT_DEPTH
mova [sums+24], m7
mova [sums+32], m7
mova [sums+40], m7
mova [sums+48], m7
%endif
; 1D hadamards
mov t0d, 12
movd m6, [pw_64]
mov t0d, 12
movd m6, [pw_32]
.loop_edge:
SCALAR_HADAMARD left, t0, m0, m1
SCALAR_HADAMARD top, t0, m1, m2, m3
paddw m6, m0
paddw m6, m1
sub t0d, 4
pavgw m0, m1
paddw m6, m0
sub t0d, 4
jge .loop_edge
psrlw m6, 3
pand m6, [sw_f0] ; dc
psrlw m6, 2
pand m6, [sw_f0] ; dc
; 2D hadamards
movifnidn r0, r0mp
mov r3, -4
movifnidn r0, r0mp
mov r3, -4
.loop_y:
mov r4, -4
mov r4, -4
.loop_x:
call hadamard_load
@@ -1988,36 +1998,66 @@ cglobal intra_satd_x3_16x16, 0,5
SUM4x3 m6, [left_1d+8*(r3+4)], [top_1d+8*(r4+4)]
pavgw m4, m7
pavgw m5, m7
paddw m0, [sums+0] ; i16x16_v satd
paddw m4, [sums+8] ; i16x16_h satd
paddw m0, [sums+ 0] ; i16x16_v satd
paddw m4, [sums+ 8] ; i16x16_h satd
paddw m5, [sums+16] ; i16x16_dc satd
movq [sums+0], m0
movq [sums+8], m4
movq [sums+16], m5
mova [sums+ 0], m0
mova [sums+ 8], m4
mova [sums+16], m5
add r0, 4
add r0, 4*SIZEOF_PIXEL
inc r4
jl .loop_x
add r0, 4*FENC_STRIDE-16
%ifdef HIGH_BIT_DEPTH
mova m7, [pw_1]
pmaddwd m4, m7
pmaddwd m0, m7
paddd m4, [sums+32]
paddd m0, [sums+24]
mova [sums+32], m4
mova [sums+24], m0
pxor m7, m7
punpckhwd m3, m5, m7
punpcklwd m5, m7
paddd m3, [sums+48]
paddd m5, [sums+40]
mova [sums+48], m3
mova [sums+40], m5
mova [sums+ 0], m7
mova [sums+ 8], m7
mova [sums+16], m7
%endif
add r0, 4*FENC_STRIDEB-16*SIZEOF_PIXEL
inc r3
jl .loop_y
; horizontal sum
movifnidn r2, r2mp
movq m2, [sums+16]
movq m1, [sums+8]
movq m0, [sums+0]
movq m7, m2
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
%ifdef HIGH_BIT_DEPTH
mova m1, m5
paddd m5, m3
HADDD m5, m7 ; DC satd
HADDD m4, m7 ; H satd
HADDD m0, m7 ; the part of V satd that doesn't overlap with DC
psrld m0, 1
psrlq m1, 32 ; DC[1]
paddd m0, m3 ; DC[2]
psrlq m3, 32 ; DC[3]
paddd m0, m1
paddd m0, m3
%else
mova m7, m5
SUM_MM_X3 m0, m4, m5, m3, m1, m2, m6, paddd
psrld m0, 1
pslld m7, 16
psrld m7, 16
paddd m0, m2
paddd m0, m5
psubd m0, m7
movd [r2+8], m2 ; i16x16_dc satd
movd [r2+4], m1 ; i16x16_h satd
movd [r2+0], m0 ; i16x16_v satd
ADD rsp, stack_pad
%endif
movd [r2+8], m5 ; i16x16_dc satd
movd [r2+4], m4 ; i16x16_h satd
movd [r2+0], m0 ; i16x16_v satd
ADD rsp, stack_pad
RET
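The new HIGH_BIT_DEPTH block inside the loop above exists because with 16-bit pixels the running SATD sums can overflow 16-bit word lanes over a full 16x16 macroblock, so after every row of 4x4 blocks the word accumulators are folded into dword accumulators and re-zeroed; pmaddwd against pw_1 sums adjacent word pairs into dwords. A scalar sketch of that widening step (hypothetical helper name, for illustration):

#include <stdint.h>

/* emulates pmaddwd with a vector of ones: each adjacent pair of 16-bit
 * lanes is multiplied by 1 and summed into a single 32-bit lane */
static void widen_words_to_dwords( const uint16_t w[8], uint32_t d[4] )
{
    for( int i = 0; i < 4; i++ )
        d[i] = (uint32_t)w[2*i] + w[2*i+1];
}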
;-----------------------------------------------------------------------------
@@ -2031,7 +2071,10 @@ cglobal intra_satd_x3_8x8c, 0,6
%define top_1d rsp+16 ; size 16
%define left_1d rsp ; size 16
movifnidn r1, r1mp
CLEAR_SUMS
pxor m7, m7
mova [sums+ 0], m7
mova [sums+ 8], m7
mova [sums+16], m7
; 1D hadamards
mov t0d, 4
@@ -2082,10 +2125,10 @@ cglobal intra_satd_x3_8x8c, 0,6
movq [sums+8], m4
movq [sums+0], m5
add r0, 4
add r0, 4*SIZEOF_PIXEL
inc r4
jl .loop_x
add r0, 4*FENC_STRIDE-8
add r0, 4*FENC_STRIDEB-8*SIZEOF_PIXEL
add r5, 8
inc r3
jl .loop_y
@@ -2095,10 +2138,18 @@ cglobal intra_satd_x3_8x8c, 0,6
movq m1, [sums+8]
movq m2, [sums+16]
movq m7, m0
%ifdef HIGH_BIT_DEPTH
psrlq m7, 16
HADDW m7, m3
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
psrld m2, 1
paddd m2, m7
%else
psrlq m7, 15
paddw m2, m7
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
psrld m2, 1
%endif
movd [r2+0], m0 ; i8x8c_dc satd
movd [r2+4], m1 ; i8x8c_h satd
movd [r2+8], m2 ; i8x8c_v satd
@@ -3717,9 +3768,9 @@ SA8D
SATDS_SSE2
%ifndef HIGH_BIT_DEPTH
INTRA_SA8D_SSE2
%endif
INIT_MMX mmx2
INTRA_X3_MMX
%endif
INIT_XMM sse2
HADAMARD_AC_SSE2
@@ -3808,13 +3859,8 @@ HADAMARD_AC_SSE2
pmaddwd m7, m5, m6
pmaddwd m5, m5
pmaddwd m6, m6
%if %1==0
SWAP 3, 5
SWAP 4, 7
%else
paddd m3, m5
paddd m4, m7
%endif
ACCUM paddd, 3, 5, %1
ACCUM paddd, 4, 7, %1
paddd m3, m6
%endmacro
@@ -112,6 +112,7 @@ void x264_intra_sa8d_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * );
int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * );
@@ -138,11 +138,7 @@ cextern pd_1024
psrad m1, 16
PSIGND m1, m0
mova [%1], m1
%if %4
por m5, m1
%else
SWAP 5, 1
%endif
ACCUM por, 5, 1, %4
%else ; !sse4
mova m0, [%1]
ABSD m1, m0
@@ -156,11 +152,7 @@ cextern pd_1024
psrld m1, 16
PSIGND m1, m0
mova [%1], m1
%if %4
por m5, m1
%else
SWAP 5, 1
%endif
ACCUM por, 5, 1, %4
%endif ; cpuflag
%endmacro
@@ -180,11 +172,7 @@ cextern pd_1024
PSIGND m3, m1
mova [%1], m2
mova [%1+mmsize], m3
%if %4
por m5, m2
%else
SWAP 5, 2
%endif
ACCUM por, 5, 2, %4
por m5, m3
%else ; !sse4
QUANT_ONE_DC %1, %2, %3, %4
@@ -208,11 +196,7 @@ cextern pd_1024
psrad m1, 16
PSIGND m1, m0
mova [%1], m1
%if %4
por m5, m1
%else
SWAP 5, 1
%endif
ACCUM por, 5, 1, %4
%endmacro
%macro QUANT_TWO_AC 4
@@ -231,11 +215,7 @@ cextern pd_1024
PSIGND m3, m1
mova [%1], m2
mova [%1+mmsize], m3
%if %4
por m5, m2
%else
SWAP 5, 2
%endif
ACCUM por, 5, 2, %4
por m5, m3
%else ; !sse4
QUANT_ONE_AC_MMX %1, %2, %3, %4
@@ -307,11 +287,7 @@ QUANT_AC 8, 8
pmulhuw m0, %2 ; divide
PSIGNW m0, m1 ; restore sign
mova %1, m0 ; store
%if %4
por m5, m0
%else
SWAP 5, 0
%endif
ACCUM por, 5, 0, %4
%endmacro
%macro QUANT_TWO 7
@@ -327,13 +303,8 @@ QUANT_AC 8, 8
PSIGNW m2, m3
mova %1, m0
mova %2, m2
%if %7
por m5, m0
ACCUM por, 5, 0, %7
por m5, m2
%else
SWAP 5, 0
por m5, m2
%endif
%endmacro
;-----------------------------------------------------------------------------
@@ -242,11 +242,7 @@ SAD_W16
psadbw m1, m3
psadbw m2, m4
lea r2, [r2+2*r3]
%if %1
paddw m0, m1
%else
SWAP 0, 1
%endif
ACCUM paddw, 0, 1, %1
paddw m0, m2
%endmacro
@@ -391,25 +387,13 @@ cglobal intra_sad_x3_4x4_mmx2, 3,3
movq m5, [r0+FENC_STRIDE*%1]
movq m4, m5
psadbw m4, m0
%if %1
paddw m1, m4
%else
SWAP 1, 4
%endif
ACCUM paddw, 1, 4, %1
movq m4, m5
psadbw m4, m6
%if %1
paddw m2, m4
%else
SWAP 2, 4
%endif
ACCUM paddw, 2, 4, %1
pshufw m4, m7, %2
psadbw m5, m4
%if %1
paddw m3, m5
%else
SWAP 3, 5
%endif
ACCUM paddw, 3, 5, %1
%endmacro
INIT_MMX
@@ -467,13 +451,8 @@ cglobal intra_sad_x3_8x8_mmx2, 3,3
psadbw m5, m6
paddw m1, m3
paddw m4, m5
%if %1
paddw m0, m1
paddw m2, m4
%else
SWAP 0,1
SWAP 2,4
%endif
ACCUM paddw, 0, 1, %1
ACCUM paddw, 2, 4, %1
%endmacro
%macro INTRA_SAD_8x8C 0
@@ -29,6 +29,7 @@
SECTION .text
cextern pw_1
cextern pw_8
;=============================================================================
; SAD MMX
@@ -469,3 +470,67 @@ SAD_X 4, 16, 8
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3]);
;-----------------------------------------------------------------------------
;m0 = DC
;m6 = V
;m7 = H
;m1 = DC score
;m2 = V score
;m3 = H score
;m5 = temp
;m4 = pixel row
%macro INTRA_SAD_HVDC_ITER 2
mova m4, [r0+(%1-4)*FENC_STRIDEB]
psubw m4, m0
ABSW m4, m4, m5
ACCUM paddw, 1, 4, %1
mova m4, [r0+(%1-4)*FENC_STRIDEB]
psubw m4, m6
ABSW m4, m4, m5
ACCUM paddw, 2, 4, %1
pshufd m5, m7, %2
psubw m5, [r0+(%1-4)*FENC_STRIDEB]
ABSW m5, m5, m4
ACCUM paddw, 3, 5, %1
%endmacro
%macro INTRA_SAD_X3_8x8 0
cglobal intra_sad_x3_8x8, 3,3,8
add r0, 4*FENC_STRIDEB
movu m0, [r1+7*SIZEOF_PIXEL]
mova m6, [r1+16*SIZEOF_PIXEL] ;V prediction
mova m7, m0
paddw m0, m6
punpckhwd m7, m7
HADDW m0, m4
paddw m0, [pw_8]
psrlw m0, 4
SPLATW m0, m0
INTRA_SAD_HVDC_ITER 0, q3333
INTRA_SAD_HVDC_ITER 1, q2222
INTRA_SAD_HVDC_ITER 2, q1111
INTRA_SAD_HVDC_ITER 3, q0000
movq m7, [r1+7*SIZEOF_PIXEL]
punpcklwd m7, m7
INTRA_SAD_HVDC_ITER 4, q3333
INTRA_SAD_HVDC_ITER 5, q2222
INTRA_SAD_HVDC_ITER 6, q1111
INTRA_SAD_HVDC_ITER 7, q0000
HADDW m2, m4
HADDW m3, m4
HADDW m1, m4
movd [r2+0], m2
movd [r2+4], m3
movd [r2+8], m1
RET
%endmacro
INIT_XMM sse2
INTRA_SAD_X3_8x8
INIT_XMM ssse3
INTRA_SAD_X3_8x8
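For reference, here is a scalar C version of what intra_sad_x3_8x8 computes, using the pixel/FENC_STRIDE definitions sketched earlier. The edge[] layout is inferred from the loads above — left neighbors at edge[7..14] stored bottom-to-top, top row at edge[16..23] — so treat this as a hedged sketch rather than x264's actual C fallback:

#include <stdlib.h>

static void intra_sad_x3_8x8_ref( const pixel *fenc, const pixel edge[36], int res[3] )
{
    const pixel *top = edge + 16;
    int dc = 8; /* rounding term: DC = (sum of 16 neighbors + 8) >> 4 */
    for( int i = 0; i < 8; i++ )
        dc += edge[7+i] + top[i];
    dc >>= 4;
    res[0] = res[1] = res[2] = 0;
    for( int y = 0; y < 8; y++ )
    {
        int left = edge[14-y]; /* left neighbor of row y */
        for( int x = 0; x < 8; x++ )
        {
            int p = fenc[y*FENC_STRIDE + x];
            res[0] += abs( p - top[x] ); /* V prediction */
            res[1] += abs( p - left );   /* H prediction */
            res[2] += abs( p - dc );     /* DC prediction */
        }
    }
}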
@@ -272,7 +272,7 @@
paddd %1, %2
%endmacro
%macro HADDW 2
%macro HADDW 2 ; reg, tmp
%if cpuflag(xop) && mmsize == 16
vphaddwq %1, %1
movhlps %2, %1
@@ -783,3 +783,12 @@
%rotate 1
%endrep
%endmacro
; instruction, accum, input, iteration (zero to swap, nonzero to add)
%macro ACCUM 4
%if %4
%1 m%2, m%3
%else
SWAP %2, %3
%endif
%endmacro