x86: optimize AVX2 cdef_dir

This optimization is so small that it is not measurable in checkasm.
Since the only actual difference is the removal of a memory load (the
pshufb mask load is replaced by a pshuflw immediate), it has to be better.
parent 14dc2038
...@@ -33,7 +33,6 @@ pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6 ...@@ -33,7 +33,6 @@ pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6
div_table: dd 840, 420, 280, 210, 168, 140, 120, 105 div_table: dd 840, 420, 280, 210, 168, 140, 120, 105
dd 420, 210, 140, 105 dd 420, 210, 140, 105
shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pw_128: times 2 dw 128 pw_128: times 2 dw 128
pw_2048: times 2 dw 2048 pw_2048: times 2 dw 2048
...@@ -600,9 +599,8 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 ...@@ -600,9 +599,8 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
; and [upper half]: ; and [upper half]:
; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567 ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd
vbroadcasti128 m14, [shufw_210xxxxx]
pslldq m4, m11, 2 pslldq m4, m11, 2
psrldq m11, 14 psrldq m11, 14
pslldq m5, m12, 4 pslldq m5, m12, 4
...@@ -616,7 +614,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 ...@@ -616,7 +614,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
paddw m11, m13 ; partial_sum_alt[3/2] right paddw m11, m13 ; partial_sum_alt[3/2] right
vbroadcasti128 m13, [div_table+32] vbroadcasti128 m13, [div_table+32]
paddw m4, m5 ; partial_sum_alt[3/2] left paddw m4, m5 ; partial_sum_alt[3/2] left
pshufb m11, m14 pshuflw m11, m11, q3012
punpckhwd m6, m4, m11 punpckhwd m6, m4, m11
punpcklwd m4, m11 punpcklwd m4, m11
pmaddwd m6, m6 pmaddwd m6, m6
...@@ -631,7 +629,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 ...@@ -631,7 +629,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
; and [upper half]: ; and [upper half]:
; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567 ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd
pslldq m5, m1, 2 pslldq m5, m1, 2
psrldq m1, 14 psrldq m1, 14
...@@ -644,7 +642,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 ...@@ -644,7 +642,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
paddw m6, m7 paddw m6, m7
paddw m1, m3 ; partial_sum_alt[0/1] right paddw m1, m3 ; partial_sum_alt[0/1] right
paddw m5, m6 ; partial_sum_alt[0/1] left paddw m5, m6 ; partial_sum_alt[0/1] left
pshufb m1, m14 pshuflw m1, m1, q3012
punpckhwd m6, m5, m1 punpckhwd m6, m5, m1
punpcklwd m5, m1 punpcklwd m5, m1
pmaddwd m6, m6 pmaddwd m6, m6
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment