Commit 08020016 authored by Henrik Gramner, committed by Henrik Gramner

x86: Add minor CDEF AVX2 optimizations

parent 585ac462
...@@ -135,7 +135,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ ...@@ -135,7 +135,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
lea dst4q, [dstq+strideq*4] lea dst4q, [dstq+strideq*4]
%endif %endif
lea stride3q, [strideq*3] lea stride3q, [strideq*3]
test edged, 2 ; have_right test edgeb, 2 ; have_right
jz .no_right jz .no_right
pmovzxbw m1, [dstq+strideq*0] pmovzxbw m1, [dstq+strideq*0]
pmovzxbw m2, [dstq+strideq*1] pmovzxbw m2, [dstq+strideq*1]
...@@ -217,13 +217,13 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ ...@@ -217,13 +217,13 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
; top ; top
DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge
test edged, 4 ; have_top test edgeb, 4 ; have_top
jz .no_top jz .no_top
mov top1q, [top2q+0*gprsize] mov top1q, [top2q+0*gprsize]
mov top2q, [top2q+1*gprsize] mov top2q, [top2q+1*gprsize]
test edged, 1 ; have_left test edgeb, 1 ; have_left
jz .top_no_left jz .top_no_left
test edged, 2 ; have_right test edgeb, 2 ; have_right
jz .top_no_right jz .top_no_right
pmovzxbw m1, [top1q-(%1/2)] pmovzxbw m1, [top1q-(%1/2)]
pmovzxbw m2, [top2q-(%1/2)] pmovzxbw m2, [top2q-(%1/2)]
...@@ -239,7 +239,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ ...@@ -239,7 +239,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
movd [px-1*%3+%1*2], xm14 movd [px-1*%3+%1*2], xm14
jmp .top_done jmp .top_done
.top_no_left: .top_no_left:
test edged, 2 ; have_right test edgeb, 2 ; have_right
jz .top_no_left_right jz .top_no_left_right
pmovzxbw m1, [top1q] pmovzxbw m1, [top1q]
pmovzxbw m2, [top2q] pmovzxbw m2, [top2q]
...@@ -272,7 +272,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ ...@@ -272,7 +272,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
.top_done: .top_done:
; left ; left
test edged, 1 ; have_left test edgeb, 1 ; have_left
jz .no_left jz .no_left
pmovzxbw xm1, [leftq+ 0] pmovzxbw xm1, [leftq+ 0]
%if %2 == 8 %if %2 == 8
...@@ -304,12 +304,12 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ ...@@ -304,12 +304,12 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
; bottom ; bottom
DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge
test edged, 8 ; have_bottom test edgeb, 8 ; have_bottom
jz .no_bottom jz .no_bottom
lea dst8q, [dstq+%2*strideq] lea dst8q, [dstq+%2*strideq]
test edged, 1 ; have_left test edgeb, 1 ; have_left
jz .bottom_no_left jz .bottom_no_left
test edged, 2 ; have_right test edgeb, 2 ; have_right
jz .bottom_no_right jz .bottom_no_right
pmovzxbw m1, [dst8q-(%1/2)] pmovzxbw m1, [dst8q-(%1/2)]
pmovzxbw m2, [dst8q+strideq-(%1/2)] pmovzxbw m2, [dst8q+strideq-(%1/2)]
...@@ -328,7 +328,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ ...@@ -328,7 +328,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
movd [px+(%2+1)*%3+%1*2], xm14 movd [px+(%2+1)*%3+%1*2], xm14
jmp .bottom_done jmp .bottom_done
.bottom_no_left: .bottom_no_left:
test edged, 2 ; have_right test edgeb, 2 ; have_right
jz .bottom_no_left_right jz .bottom_no_left_right
pmovzxbw m1, [dst8q] pmovzxbw m1, [dst8q]
pmovzxbw m2, [dst8q+strideq] pmovzxbw m2, [dst8q+strideq]
...@@ -362,50 +362,49 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ ...@@ -362,50 +362,49 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
; actual filter ; actual filter
INIT_YMM avx2 INIT_YMM avx2
DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, stride3, secdmp DEFINE_ARGS dst, stride, pridmp, damping, pri, secdmp, stride3, zero
%undef edged %undef edged
; register to shuffle values into after packing ; register to shuffle values into after packing
vbroadcasti128 m12, [shufb_lohi] vbroadcasti128 m12, [shufb_lohi]
movifnidn prid, prim movifnidn prid, prim
movifnidn secd, secm
mov dampingd, r7m mov dampingd, r7m
lzcnt pridmpd, prid
mov pridmpd, prid %if UNIX64
mov secdmpd, secd movd xm0, prid
or pridmpd, 1 movd xm1, secdmpd
or secdmpd, 1 %endif
lzcnt pridmpd, pridmpd lzcnt secdmpd, secdmpm
lzcnt secdmpd, secdmpd sub dampingd, 31
lea pridmpd, [pridmpd+dampingd-31] xor zerod, zerod
lea secdmpd, [secdmpd+dampingd-31] add pridmpd, dampingd
xor dampingd, dampingd cmovl pridmpd, zerod
test pridmpd, pridmpd add secdmpd, dampingd
cmovl pridmpd, dampingd cmovl secdmpd, zerod
test secdmpd, secdmpd
cmovl secdmpd, dampingd
mov [rsp+0], pridmpq ; pri_shift mov [rsp+0], pridmpq ; pri_shift
mov [rsp+8], secdmpq ; sec_shift mov [rsp+8], secdmpq ; sec_shift
DEFINE_ARGS dst, stride, pridmp, table, pri, sec, stride3, secdmp DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3
lea tableq, [tap_table] lea tableq, [tap_table]
vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
; pri/sec_taps[k] [4 total] ; pri/sec_taps[k] [4 total]
DEFINE_ARGS dst, stride, dummy, table, pri, sec, stride3 DEFINE_ARGS dst, stride, dir, table, pri, sec, stride3
movd xm0, prid %if UNIX64
movd xm1, secd
vpbroadcastb m0, xm0 ; pri_strength vpbroadcastb m0, xm0 ; pri_strength
vpbroadcastb m1, xm1 ; sec_strength vpbroadcastb m1, xm1 ; sec_strength
%else
vpbroadcastb m0, prim
vpbroadcastb m1, secm
%endif
and prid, 1 and prid, 1
lea priq, [tableq+priq*2+8] ; pri_taps lea priq, [tableq+priq*2+8] ; pri_taps
lea secq, [tableq+12] ; sec_taps lea secq, [tableq+12] ; sec_taps
; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
DEFINE_ARGS dst, stride, dir, tap, pri, sec, stride3
mov dird, r6m mov dird, r6m
lea dirq, [tapq+dirq*2+14] lea dirq, [tableq+dirq*2+14]
%if %1*%2*2/mmsize > 1 %if %1*%2*2/mmsize > 1
%if %1 == 4 %if %1 == 4
DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
...@@ -614,9 +613,9 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 ...@@ -614,9 +613,9 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
paddw m11, m13 ; partial_sum_alt[3/2] right paddw m11, m13 ; partial_sum_alt[3/2] right
vbroadcasti128 m13, [div_table+32] vbroadcasti128 m13, [div_table+32]
paddw m4, m5 ; partial_sum_alt[3/2] left paddw m4, m5 ; partial_sum_alt[3/2] left
pshuflw m11, m11, q3012 pshuflw m5, m11, q3012
punpckhwd m6, m4, m11 punpckhwd m6, m11, m4
punpcklwd m4, m11 punpcklwd m4, m5
pmaddwd m6, m6 pmaddwd m6, m6
pmaddwd m4, m4 pmaddwd m4, m4
pmulld m6, m12 pmulld m6, m12
...@@ -642,14 +641,14 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 ...@@ -642,14 +641,14 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
paddw m6, m7 paddw m6, m7
paddw m1, m3 ; partial_sum_alt[0/1] right paddw m1, m3 ; partial_sum_alt[0/1] right
paddw m5, m6 ; partial_sum_alt[0/1] left paddw m5, m6 ; partial_sum_alt[0/1] left
pshuflw m1, m1, q3012 pshuflw m0, m1, q3012
punpckhwd m6, m5, m1 punpckhwd m1, m5
punpcklwd m5, m1 punpcklwd m5, m0
pmaddwd m6, m6 pmaddwd m1, m1
pmaddwd m5, m5 pmaddwd m5, m5
pmulld m6, m12 pmulld m1, m12
pmulld m5, m13 pmulld m5, m13
paddd m5, m6 ; cost1[a-d] | cost3[a-d] paddd m5, m1 ; cost1[a-d] | cost3[a-d]
mova xm0, [pd_47130256+ 16] mova xm0, [pd_47130256+ 16]
mova m1, [pd_47130256] mova m1, [pd_47130256]
...@@ -661,11 +660,10 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 ...@@ -661,11 +660,10 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
; now find the best cost ; now find the best cost
pmaxsd xm2, xm0, xm1 pmaxsd xm2, xm0, xm1
pshufd xm3, xm2, q3232 pshufd xm3, xm2, q1032
pmaxsd xm2, xm3
pshufd xm3, xm2, q1111
pmaxsd xm2, xm3 pmaxsd xm2, xm3
pshufd xm2, xm2, q0000 ; best cost pshufd xm3, xm2, q2301
pmaxsd xm2, xm3 ; best cost
; find the idx using minpos ; find the idx using minpos
; make everything other than the best cost negative via subtraction ; make everything other than the best cost negative via subtraction
...@@ -676,7 +674,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 ...@@ -676,7 +674,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
phminposuw xm3, xm3 phminposuw xm3, xm3
; convert idx to 32-bits ; convert idx to 32-bits
psrldq xm3, 2 psrld xm3, 16
movd eax, xm3 movd eax, xm3
; get idx^4 complement ; get idx^4 complement
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment