Commit c204da0f authored by Kyle Siefring, committed by Ronald S. Bultje

Use some 8 bit arithmetic in AVX2 CDEF filter

Before:
cdef_filter_8x8_8bpc_avx2: 252.3
cdef_filter_4x8_8bpc_avx2: 182.1
cdef_filter_4x4_8bpc_avx2: 105.7

After:
cdef_filter_8x8_8bpc_avx2: 235.5
cdef_filter_4x8_8bpc_avx2: 174.8
cdef_filter_4x4_8bpc_avx2: 101.8
parent ac4e679b
...@@ -34,9 +34,13 @@ div_table: dd 840, 420, 280, 210, 168, 140, 120, 105 ...@@ -34,9 +34,13 @@ div_table: dd 840, 420, 280, 210, 168, 140, 120, 105
dd 420, 210, 140, 105 dd 420, 210, 140, 105
shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pw_128: times 2 dw 128 pw_128: times 2 dw 128
pw_2048: times 2 dw 2048 pw_2048: times 2 dw 2048
tap_table: dw 4, 2, 3, 3, 2, 1 tap_table: ; masks for 8 bit shifts
db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
; weights
db 4, 2, 3, 3, 2, 1
db -1 * 16 + 1, -2 * 16 + 2 db -1 * 16 + 1, -2 * 16 + 2
db 0 * 16 + 1, -1 * 16 + 2 db 0 * 16 + 1, -1 * 16 + 2
db 0 * 16 + 1, 0 * 16 + 2 db 0 * 16 + 1, 0 * 16 + 2
...@@ -55,29 +59,29 @@ tap_table: dw 4, 2, 3, 3, 2, 1 ...@@ -55,29 +59,29 @@ tap_table: dw 4, 2, 3, 3, 2, 1
SECTION .text SECTION .text
%macro ACCUMULATE_TAP 6 ; tap_offset, shift, strength, mul_tap, w, stride %macro ACCUMULATE_TAP 7 ; tap_offset, shift, mask, strength, mul_tap, w, stride
; load p0/p1 ; load p0/p1
movsx offq, byte [dirq+kq+%1] ; off1 movsx offq, byte [dirq+kq+%1] ; off1
%if %5 == 4 %if %6 == 4
movq xm5, [stkq+offq*2+%6*0] ; p0 movq xm5, [stkq+offq*2+%7*0] ; p0
movq xm6, [stkq+offq*2+%6*2] movq xm6, [stkq+offq*2+%7*2]
movhps xm5, [stkq+offq*2+%6*1] movhps xm5, [stkq+offq*2+%7*1]
movhps xm6, [stkq+offq*2+%6*3] movhps xm6, [stkq+offq*2+%7*3]
vinserti128 m5, xm6, 1 vinserti128 m5, xm6, 1
%else %else
movu xm5, [stkq+offq*2+%6*0] ; p0 movu xm5, [stkq+offq*2+%7*0] ; p0
vinserti128 m5, [stkq+offq*2+%6*1], 1 vinserti128 m5, [stkq+offq*2+%7*1], 1
%endif %endif
neg offq ; -off1 neg offq ; -off1
%if %5 == 4 %if %6 == 4
movq xm6, [stkq+offq*2+%6*0] ; p1 movq xm6, [stkq+offq*2+%7*0] ; p1
movq xm9, [stkq+offq*2+%6*2] movq xm9, [stkq+offq*2+%7*2]
movhps xm6, [stkq+offq*2+%6*1] movhps xm6, [stkq+offq*2+%7*1]
movhps xm9, [stkq+offq*2+%6*3] movhps xm9, [stkq+offq*2+%7*3]
vinserti128 m6, xm9, 1 vinserti128 m6, xm9, 1
%else %else
movu xm6, [stkq+offq*2+%6*0] ; p1 movu xm6, [stkq+offq*2+%7*0] ; p1
vinserti128 m6, [stkq+offq*2+%6*1], 1 vinserti128 m6, [stkq+offq*2+%7*1], 1
%endif %endif
; out of bounds values are set to a value that is a both a large unsigned ; out of bounds values are set to a value that is a both a large unsigned
; value and a negative signed value. ; value and a negative signed value.
...@@ -88,24 +92,26 @@ SECTION .text ...@@ -88,24 +92,26 @@ SECTION .text
pminuw m8, m6 ; min after p1 pminuw m8, m6 ; min after p1
; accumulate sum[m15] over p0/p1 ; accumulate sum[m15] over p0/p1
; calculate difference before converting
psubw m5, m4 ; diff_p0(p0 - px) psubw m5, m4 ; diff_p0(p0 - px)
psubw m6, m4 ; diff_p1(p1 - px) psubw m6, m4 ; diff_p1(p1 - px)
pabsw m9, m5
pabsw m10, m6 ; convert to 8-bits with signed saturation
psignw m11, %4, m5 ; saturating to large diffs has no impact on the results
psignw m12, %4, m6 packsswb m5, m6
psrlw m5, m9, %2
psrlw m6, m10, %2 ; group into pairs so we can accumulate using maddubsw
psubusw m5, %3, m5 pshufb m5, m12
psubusw m6, %3, m6 pabsb m9, m5
psignb m10, %5, m5
; use unsigned min since abs diff can equal 0x8000 psrlw m5, m9, %2 ; emulate 8-bit shift
pminuw m5, m9 ; constrain(diff_p0) pand m5, %3
pminuw m6, m10 ; constrain(diff_p1) psubusb m5, %4, m5
pmullw m5, m11 ; constrain(diff_p0) * taps
pmullw m6, m12 ; constrain(diff_p1) * taps ; use unsigned min since abs diff can equal 0x80
pminub m5, m9
pmaddubsw m5, m10
paddw m15, m5 paddw m15, m5
paddw m15, m6
%endmacro %endmacro
%macro cdef_filter_fn 3 ; w, h, stride %macro cdef_filter_fn 3 ; w, h, stride
...@@ -359,6 +365,9 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ ...@@ -359,6 +365,9 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
INIT_YMM avx2 INIT_YMM avx2
DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, stride3, secdmp DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, stride3, secdmp
%undef edged %undef edged
; register to shuffle values into after packing
vbroadcasti128 m12, [shufb_lohi]
movifnidn prid, prim movifnidn prid, prim
movifnidn secd, secm movifnidn secd, secm
mov dampingd, r7m mov dampingd, r7m
...@@ -379,21 +388,25 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ ...@@ -379,21 +388,25 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
mov [rsp+0], pridmpq ; pri_shift mov [rsp+0], pridmpq ; pri_shift
mov [rsp+8], secdmpq ; sec_shift mov [rsp+8], secdmpq ; sec_shift
DEFINE_ARGS dst, stride, pridmp, table, pri, sec, stride3, secdmp
lea tableq, [tap_table]
vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
; pri/sec_taps[k] [4 total] ; pri/sec_taps[k] [4 total]
DEFINE_ARGS dst, stride, tap, dummy, pri, sec, stride3 DEFINE_ARGS dst, stride, dummy, table, pri, sec, stride3
movd xm0, prid movd xm0, prid
movd xm1, secd movd xm1, secd
vpbroadcastw m0, xm0 ; pri_strength vpbroadcastb m0, xm0 ; pri_strength
vpbroadcastw m1, xm1 ; sec_strength vpbroadcastb m1, xm1 ; sec_strength
and prid, 1 and prid, 1
lea tapq, [tap_table] lea priq, [tableq+priq*2+8] ; pri_taps
lea priq, [tapq+priq*4] ; pri_taps lea secq, [tableq+12] ; sec_taps
lea secq, [tapq+8] ; sec_taps
; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
DEFINE_ARGS dst, stride, tap, dir, pri, sec, stride3 DEFINE_ARGS dst, stride, dir, tap, pri, sec, stride3
mov dird, r6m mov dird, r6m
lea tapq, [tapq+dirq*2+12] lea dirq, [tapq+dirq*2+14]
%if %1*%2*2/mmsize > 1 %if %1*%2*2/mmsize > 1
%if %1 == 4 %if %1 == 4
DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
...@@ -405,7 +418,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ ...@@ -405,7 +418,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k
%endif %endif
lea stkq, [px] lea stkq, [px]
pxor m13, m13 pxor m11, m11
%if %1*%2*2/mmsize > 1 %if %1*%2*2/mmsize > 1
.v_loop: .v_loop:
%endif %endif
...@@ -424,20 +437,20 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ ...@@ -424,20 +437,20 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
mova m7, m4 ; max mova m7, m4 ; max
mova m8, m4 ; min mova m8, m4 ; min
.k_loop: .k_loop:
vpbroadcastw m2, [priq+kq*2] ; pri_taps vpbroadcastb m2, [priq+kq] ; pri_taps
vpbroadcastw m3, [secq+kq*2] ; sec_taps vpbroadcastb m3, [secq+kq] ; sec_taps
ACCUMULATE_TAP 0*2, [rsp+0], m0, m2, %1, %3 ACCUMULATE_TAP 0*2, [rsp+0], m13, m0, m2, %1, %3
ACCUMULATE_TAP 2*2, [rsp+8], m1, m3, %1, %3 ACCUMULATE_TAP 2*2, [rsp+8], m14, m1, m3, %1, %3
ACCUMULATE_TAP 6*2, [rsp+8], m1, m3, %1, %3 ACCUMULATE_TAP 6*2, [rsp+8], m14, m1, m3, %1, %3
dec kq dec kq
jge .k_loop jge .k_loop
vpbroadcastd m12, [pw_2048] vpbroadcastd m10, [pw_2048]
pcmpgtw m11, m13, m15 pcmpgtw m9, m11, m15
paddw m15, m11 paddw m15, m9
pmulhrsw m15, m12 pmulhrsw m15, m10
paddw m4, m15 paddw m4, m15
pminsw m4, m7 pminsw m4, m7
pmaxsw m4, m8 pmaxsw m4, m8
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment