Commit aa47955a authored by Fiona Glaser

AVX 32-bit hpel_filter_h

Faster on Sandy Bridge.
Also add details on unsuccessful optimizations in these functions.
parent d7407cf8
......@@ -89,7 +89,7 @@ cextern hsub_mul
%macro IDCT8_1D 11
SUMSUB_BA %1, %6, %2, %10 ; %5=a0, %1=a2
psra%1 m%10, m%3, 1
padd%1 m%10, m%3
padd%1 m%10, m%5
......
......@@ -461,6 +461,7 @@ cglobal hpel_filter_c, 3,3,9
%else
%define tpw_32 [pw_32]
%endif
; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
%if cpuflag(misalign)
.loop:
movu m4, [src-4]
......@@ -559,11 +560,11 @@ cglobal hpel_filter_h_sse2, 3,3,8
jl .loop
REP_RET
%if ARCH_X86_64 == 0
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_ssse3, 3,3
%macro HPEL_H 0
cglobal hpel_filter_h, 3,3
add r0, r2
add r1, r2
neg r2
......@@ -573,6 +574,9 @@ cglobal hpel_filter_h_ssse3, 3,3
mova m7, [pw_16]
.loop:
mova m2, [src+16]
; Using unaligned loads instead of palignr is marginally slower on SB and significantly
; slower on Bulldozer, despite their fast load units -- even though it would let us avoid
; the repeated loads of constants for pmaddubsw.
palignr m3, m1, m0, 14
palignr m4, m1, m0, 15
palignr m0, m2, m1, 2
......@@ -596,7 +600,7 @@ cglobal hpel_filter_h_ssse3, 3,3
add r2, 16
jl .loop
REP_RET
%endif ; !ARCH_X86_64
%endmacro
INIT_MMX mmx2
HPEL_V 0
......@@ -610,9 +614,11 @@ HPEL_C
INIT_XMM ssse3
HPEL_C
HPEL_V 0
HPEL_H
INIT_XMM avx
HPEL_C
HPEL_V 0
HPEL_H
%endif
%if ARCH_X86_64
......
......@@ -450,7 +450,7 @@ void x264_hpel_filter_avx( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t
#else
HPEL(16, sse2, sse2, sse2, sse2)
HPEL(16, ssse3, ssse3, ssse3, ssse3)
HPEL(16, avx, avx, avx, ssse3)
HPEL(16, avx, avx, avx, avx)
#endif
HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
#endif // HIGH_BIT_DEPTH
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment