Commit 65ee1233 authored by Kyle Siefring's avatar Kyle Siefring Committed by Ronald S. Bultje

Speed up finding the best cost in avx2 cdef

parent 4380bc33
...@@ -29,10 +29,10 @@ ...@@ -29,10 +29,10 @@
%if ARCH_X86_64 %if ARCH_X86_64
SECTION_RODATA 32 SECTION_RODATA 32
pd_04512763: dd 0, 4, 5, 1, 2, 7, 6, 3 pd_02564713: dd 0, 2, 5, 6, 4, 7, 1, 3
pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6
div_table: dd 840, 420, 280, 210, 168, 140, 120, 105 div_table: dd 840, 420, 280, 210, 168, 140, 120, 105
dd 420, 210, 140, 105 dd 420, 210, 140, 105
pd_04261537: dd 0, 4, 2, 6, 1, 5, 3, 7
shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
pw_128: times 2 dw 128 pw_128: times 2 dw 128
...@@ -640,54 +640,38 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 ...@@ -640,54 +640,38 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
pmulld m5, m13 pmulld m5, m13
paddd m5, m6 ; cost1[a-d] | cost3[a-d] paddd m5, m6 ; cost1[a-d] | cost3[a-d]
mova xm0, [pd_04512763+ 0] mova xm0, [pd_47130256+ 16]
mova xm1, [pd_04512763+ 16] mova m1, [pd_47130256]
phaddd m9, m8 phaddd m9, m8
phaddd m5, m4 phaddd m5, m4
phaddd m9, m5 phaddd m9, m5
vpermd m0, m9 ; cost[0/4/2/6] vpermd m0, m9 ; cost[0-3]
vpermd m1, m9 ; cost[1/5/3/7] vpermd m1, m9 ; cost[4-7] | cost[0-3]
; now find the best cost, its idx^4 complement, and its idx ; now find the best cost
pcmpgtd xm2, xm1, xm0 ; [1/5/3/7] > [0/4/2/6] pmaxsd xm2, xm0, xm1
pand xm3, xm2, xm1 pshufd xm3, xm2, q3232
pandn xm4, xm2, xm0 pmaxsd xm2, xm3
por xm3, xm4 ; higher 4 values pshufd xm3, xm2, q1111
pshufd xm1, xm1, q2301 pmaxsd xm2, xm3
pshufd xm0, xm0, q2301 pshufd xm2, xm2, q0000 ; best cost
pand xm1, xm2, xm1
pandn xm0, xm2, xm0 ; find the idx using minpos
por xm0, xm1 ; complementary 4 values at idx^4 offset ; make everything other than the best cost negative via subtraction
pand xm13, xm2, [pd_04261537+16] ; find the min of unsigned 16-bit ints to sort out the negative values
pandn xm14, xm2, [pd_04261537+ 0] psubd xm4, xm1, xm2
por xm14, xm13 ; indices psubd xm3, xm0, xm2
packssdw xm3, xm4
punpckhqdq xm4, xm3, xm0 phminposuw xm3, xm3
punpcklqdq xm3, xm0
pcmpgtd xm5, xm4, xm3 ; [2or3-6or7] > [0or1/4or5] ; convert idx to 32-bits
punpcklqdq xm5, xm5 psrldq xm3, 2
pand xm6, xm5, xm4 movd eax, xm3
pandn xm7, xm5, xm3
por xm6, xm7 ; { highest 2 values, complements at idx^4 } ; get idx^4 complement
movhlps xm13, xm14 vpermd m3, m1
pand xm13, xm5, xm13 psubd xm2, xm3
pandn xm14, xm5, xm14 psrld xm2, 10
por xm14, xm13 movd [varq], xm2
pshufd xm7, xm6, q3311
pcmpgtd xm8, xm7, xm6 ; [4or5or6or7] > [0or1or2or3]
punpcklqdq xm8, xm8
pand xm9, xm8, xm7
pandn xm10, xm8, xm6
por xm9, xm10 ; max
movhlps xm10, xm9 ; complement at idx^4
psubd xm9, xm10
psrld xm9, 10
movd [varq], xm9
pshufd xm13, xm14, q1111
pand xm13, xm8, xm13
pandn xm14, xm8, xm14
por xm14, xm13
movd eax, xm14
RET RET
%endif ; ARCH_X86_64 %endif ; ARCH_X86_64
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment