Commit 50aaf8d8 authored by Loren Merritt's avatar Loren Merritt Committed by Fiona Glaser
Browse files

Remove obsolete versions of intra_mbcmp_x3

The intra_mbcmp_x3 functions are unnecessary wherever the intra_mbcmp_x9 functions exist (SSSE3 and onwards), so remove the x3 versions for those instruction sets.
parent 1111780d
......@@ -1091,13 +1091,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_ssse3;
#if ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
#endif
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
......@@ -1127,7 +1121,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
}
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse4;
}
if( cpu&X264_CPU_AVX )
......@@ -1149,17 +1142,12 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT5( ssd, _avx );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx;
#if ARCH_X86_64
pixf->intra_sa8d_x3_8x8= x264_intra_sa8d_x3_8x8_avx;
#endif
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx;
pixf->ssim_end4 = x264_pixel_ssim_end4_avx;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_avx;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx;
}
if( cpu&X264_CPU_XOP )
......@@ -1175,9 +1163,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT5( ssd, _xop );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop;
#if ARCH_X86_64
pixf->intra_sa8d_x3_8x8= x264_intra_sa8d_x3_8x8_xop;
#endif
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
......
......@@ -1635,12 +1635,14 @@ cglobal pixel_sa8d_16x16, 4,7
paddw %3, %5
%endmacro
; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+,
; and are only retained for old cpus.
%macro INTRA_SA8D_SSE2 0
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
cglobal intra_sa8d_x3_8x8, 3,3,16
cglobal intra_sa8d_x3_8x8, 3,3,14
; 8x8 hadamard
pxor m8, m8
movq m0, [r0+0*FENC_STRIDE]
......@@ -1667,23 +1669,15 @@ cglobal intra_sa8d_x3_8x8, 3,3,16
paddusw m8, m10
paddusw m9, m11
ABSW2 m10, m11, m6, m7, m6, m7
ABSW m15, m1, m1
ABSW m13, m1, m1
paddusw m10, m11
paddusw m8, m9
paddusw m15, m10
paddusw m15, m8
paddusw m13, m10
paddusw m13, m8
; 1D hadamard of edges
movq m8, [r1+7]
movq m9, [r1+16]
%if cpuflag(ssse3)
punpcklwd m8, m8
pshufb m9, [intrax3_shuf]
pmaddubsw m8, [pb_pppm]
pmaddubsw m9, [pb_pppm]
HSUMSUB2 psignw, m8, m9, m10, m11, m9, q1032, [pw_ppppmmmm]
HSUMSUB2 psignw, m8, m9, m10, m11, m9, q2301, [pw_ppmmppmm]
%else ; sse2
pxor m10, m10
punpcklbw m8, m10
punpcklbw m9, m10
......@@ -1697,7 +1691,6 @@ cglobal intra_sa8d_x3_8x8, 3,3,16
pmullw m11, [pw_pmpmpmpm]
paddw m8, m10
paddw m9, m11
%endif
; differences
paddw m10, m8, m9
......@@ -1709,8 +1702,8 @@ cglobal intra_sa8d_x3_8x8, 3,3,16
psubw m8, m0
psubw m10, m0
ABSW2 m8, m10, m8, m10, m11, m12 ; 1x8 sum
paddusw m14, m8, m15
paddusw m15, m10
paddusw m8, m13
paddusw m13, m10
punpcklwd m0, m1
punpcklwd m2, m3
punpcklwd m4, m5
......@@ -1719,44 +1712,29 @@ cglobal intra_sa8d_x3_8x8, 3,3,16
punpckldq m4, m6
punpcklqdq m0, m4 ; transpose
psllw m9, 3 ; top edge
psrldq m2, m15, 2 ; 8x7 sum
psrldq m2, m13, 2 ; 8x7 sum
psubw m0, m9 ; 8x1 sum
ABSW m0, m0, m9
paddusw m2, m0
; 3x HADDW
%if cpuflag(xop)
phaddw m2, m14
vphadduwq m0, m15
movhlps m1, m0
vphadduwq m2, m2 ; i8x8_v, i8x8_h
paddd m0, m1 ; i8x8_dc
packusdw m2, m0 ; i8x8_v, i8x8_h, i8x8_dc
pxor m3, m3
psrlw m2, 1
pavgw m2, m3
movq [r2], m2 ; i8x8_v, i8x8_h
psrldq m2, 8
movd [r2+8], m2 ; i8x8_dc
%else
movdqa m7, [pw_1]
pmaddwd m2, m7
pmaddwd m14, m7
pmaddwd m15, m7
punpckhdq m3, m2, m14
punpckldq m2, m14
pshufd m5, m15, q3311
pmaddwd m8, m7
pmaddwd m13, m7
punpckhdq m3, m2, m8
punpckldq m2, m8
pshufd m5, m13, q3311
paddd m2, m3
paddd m5, m15
punpckhqdq m3, m2, m5
paddd m5, m13
punpckhqdq m0, m2, m5
punpcklqdq m2, m5
pavgw m3, m2
pxor m0, m0
pavgw m3, m0
movq [r2], m3 ; i8x8_v, i8x8_h
psrldq m3, 8
movd [r2+8], m3 ; i8x8_dc
%endif
pavgw m0, m2
pxor m1, m1
pavgw m0, m1
movq [r2], m0 ; i8x8_v, i8x8_h
psrldq m0, 8
movd [r2+8], m0 ; i8x8_dc
RET
%endif ; ARCH_X86_64
%endmacro ; INTRA_SA8D_SSE2
......@@ -3714,7 +3692,6 @@ INTRA8_X9
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps
%ifndef HIGH_BIT_DEPTH
INTRA_SA8D_SSE2
INIT_MMX ssse3
INTRA_X3_MMX
%endif
......@@ -3734,7 +3711,6 @@ INIT_XMM avx
SATDS_SSE2
SA8D
%ifndef HIGH_BIT_DEPTH
INTRA_SA8D_SSE2
INTRA_X9
INTRA8_X9
%endif
......@@ -3745,7 +3721,6 @@ INIT_XMM xop
SATDS_SSE2
SA8D
%ifndef HIGH_BIT_DEPTH
INTRA_SA8D_SSE2
INTRA_X9
; no xop INTRA8_X9. it's slower than avx on bulldozer. dunno why.
%endif
......
......@@ -97,10 +97,7 @@ DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, int i_stride ))
void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * );
void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_4x4_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_4x4_sse4 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_4x4_avx ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmx2 ( pixel *, pixel *, int * );
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8c_mmx2 ( pixel *, pixel *, int * );
......@@ -113,13 +110,8 @@ void x264_intra_sad_x3_16x16_sse2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_16x16_ssse3 ( pixel *, pixel *, int * );
void x264_intra_sa8d_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * );
void x264_intra_sa8d_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_avx ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_xop ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_avx ( pixel *, pixel *, int * );
int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * );
......
......@@ -29,19 +29,6 @@
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA
h4x4_pred_shuf: db 3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15
h4x4_pred_shuf2: db 3,7,11,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
h8x8_pred_shuf: times 8 db 1
times 8 db 0
times 8 db 3
times 8 db 2
times 8 db 5
times 8 db 4
times 8 db 7
times 8 db 6
SECTION .text
cextern pb_3
......@@ -385,45 +372,6 @@ cglobal intra_sad_x3_4x4_mmx2, 3,3
movd [r2+4], mm1 ;H prediction cost
RET
%macro INTRA_SADx3_4x4 0
;-----------------------------------------------------------------------------
; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] )
; Computes the SAD of the 4x4 fenc block against the V, H and DC intra
; predictions in one pass. Results: res[0]=V, res[1]=H, res[2]=DC.
; In:  r0 = fenc, r1 = fdec (reconstructed neighbors), r2 = res
; Requires SSE4.1 (pinsrd/packusdw) and SSSE3 (pshufb).
;-----------------------------------------------------------------------------
cglobal intra_sad_x3_4x4, 3,3
; gather the pixel to the left of each of the 4 fdec rows into xmm4
movd xmm4, [r1+FDEC_STRIDE*0-4]
pinsrd xmm4, [r1+FDEC_STRIDE*1-4], 1
pinsrd xmm4, [r1+FDEC_STRIDE*2-4], 2
pinsrd xmm4, [r1+FDEC_STRIDE*3-4], 3
movd xmm2, [r1-FDEC_STRIDE] ; 4 top neighbors ABCD
pxor xmm3, xmm3 ; zero, used for psadbw sums and rounding
pshufb xmm5, xmm4, [h4x4_pred_shuf] ; EEEEFFFFGGGGHHHH = H prediction (each left pixel replicated per row)
pshufb xmm4, [h4x4_pred_shuf2] ; EFGH = the 4 left pixels compacted
pshufd xmm0, xmm2, 0 ; ABCDABCDABCDABCD = V prediction (top row replicated)
punpckldq xmm2, xmm4 ; ABCDEFGH = all 8 neighbor pixels
psadbw xmm2, xmm3 ; sum of the 8 neighbors (DC numerator)
; load the 4 fenc rows into xmm1
movd xmm1, [r0+FENC_STRIDE*0]
pinsrd xmm1, [r0+FENC_STRIDE*1], 1
pinsrd xmm1, [r0+FENC_STRIDE*2], 2
pinsrd xmm1, [r0+FENC_STRIDE*3], 3
psadbw xmm0, xmm1 ; V prediction SAD (per-qword partial sums)
psadbw xmm5, xmm1 ; H prediction SAD (per-qword partial sums)
psraw xmm2, 2 ; sum>>2 ...
pavgw xmm2, xmm3 ; ... then round-avg with 0 => rounded DC value
pshufb xmm2, xmm3 ; broadcast DC byte to all lanes (shuffle with all-zero indices)
punpckhqdq xmm3, xmm0, xmm5 ; high qwords of V/H partial sums
punpcklqdq xmm0, xmm5 ; low qwords of V/H partial sums
psadbw xmm2, xmm1 ; DC prediction SAD (per-qword partial sums)
paddw xmm0, xmm3 ; combine halves: xmm0 = {V sum, H sum}
movhlps xmm4, xmm2 ; fold DC partial sums
packusdw xmm0, xmm0 ; pack V/H dword sums for the 2x32-bit store
paddw xmm2, xmm4 ; DC total
movq [r2], xmm0 ; V/H prediction costs
movd [r2+8], xmm2 ; DC prediction cost
RET
%endmacro ; INTRA_SADx3_4x4
INIT_XMM sse4
INTRA_SADx3_4x4
INIT_XMM avx
INTRA_SADx3_4x4
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[36], int res[3]);
;-----------------------------------------------------------------------------
......@@ -491,69 +439,6 @@ cglobal intra_sad_x3_8x8_mmx2, 3,3
movd [r2+8], m1
RET
%macro INTRA_SADx3_8x8 0
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[36], int res[3] )
; Computes the SAD of the 8x8 fenc block against the V, H and DC intra
; predictions in one pass over the block (2 rows per iteration).
; In:  r0 = fenc, r1 = edge (filtered neighbor array), r2 = res
; Out: res[0]=V, res[1]=H, res[2]=DC
; x86-64 uses an extra register (m8) to keep the three SADs independent;
; x86-32 reuses m7 for two of them.
;-----------------------------------------------------------------------------
cglobal intra_sad_x3_8x8, 3,4,9
%ifdef PIC
; h8x8_pred_shuf must be addressed through a register under PIC
lea r11, [h8x8_pred_shuf]
%define shuf r11
%else
%define shuf h8x8_pred_shuf
%endif
movq m0, [r1+7] ; left pixels
movq m1, [r1+16] ; top pixels
pxor m2, m2
pxor m3, m3
psadbw m2, m0 ; sum of left pixels
psadbw m3, m1 ; sum of top pixels
paddw m2, m3 ; sum of all 16 neighbors (DC numerator)
pxor m3, m3 ; V score accumulator
psraw m2, 3 ; sum>>3 ...
pavgw m2, m3 ; ... then round-avg with 0 => rounded DC value
punpcklqdq m1, m1 ; V prediction (top row in both qwords)
pshufb m2, m3 ; DC prediction (broadcast DC byte via all-zero shuffle)
pxor m4, m4 ; H score accumulator
pxor m5, m5 ; DC score accumulator
mov r3d, 6 ; row-pair counter: 6,4,2,0 (also indexes the shuffle table)
.loop:
movq m6, [r0+FENC_STRIDE*0] ; fenc row pair: low qword ...
movhps m6, [r0+FENC_STRIDE*1] ; ... and high qword
pshufb m7, m0, [shuf+r3*8] ; H prediction: per-row left pixel broadcast from table
%ifdef ARCH_X86_64
; three independent psadbw chains thanks to the spare register m8
psadbw m7, m6
psadbw m8, m1, m6
psadbw m6, m2
paddw m4, m7 ; H += row-pair SAD
paddw m3, m8 ; V += row-pair SAD
paddw m5, m6 ; DC += row-pair SAD
%else
; x86-32: only 8 xmm regs, so m7 is reused for the V SAD
psadbw m7, m6
paddw m4, m7 ; H += row-pair SAD
psadbw m7, m1, m6
psadbw m6, m2
paddw m3, m7 ; V += row-pair SAD
paddw m5, m6 ; DC += row-pair SAD
%endif
add r0, FENC_STRIDE*2
sub r3d, 2
jge .loop
; fold the high-qword partial sums into the low qwords
movhlps m0, m3
movhlps m1, m4
movhlps m2, m5
paddw m3, m0
paddw m4, m1
paddw m5, m2
movd [r2+0], m3 ; V prediction cost
movd [r2+4], m4 ; H prediction cost
movd [r2+8], m5 ; DC prediction cost
RET
%endmacro ; INTRA_SADx3_8x8
INIT_XMM ssse3
INTRA_SADx3_8x8
INIT_XMM avx
INTRA_SADx3_8x8
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment