Commit 68a6268b authored by Fiona Glaser's avatar Fiona Glaser
Browse files

x86: faster AVX satd/sa8d/sa8d_satd/hadamard_ac

Use Conroe-style movddup in AVX transforms; both Sandy Bridge and Bulldozer
do movddup in the load unit, so it's totally free this way.

On Sandy Bridge:
~6% faster sa8d_satd
~5% faster hadamard_ac
~9% faster 32-bit satd
~2% faster sa8d
parent 5d60b9c9
......@@ -1431,7 +1431,9 @@ cglobal pixel_satd_8x8_internal
SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
ret
%if HIGH_BIT_DEPTH == 0 && UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
%if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx)
cglobal pixel_satd_16x4_internal
LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
lea r2, [r2+4*r3]
......@@ -4032,6 +4034,9 @@ INTRA_X9
INTRA8_X9
%endif
; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
; it's effectively free.
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
INIT_XMM avx
SATDS_SSE2
SA8D
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment