Commit f7043e47 authored by Ronald S. Bultje

Add 10/12-bit deblock SSSE3 implementation

Currently 64-bit only.
parent c187e704
@@ -224,6 +224,7 @@ if is_asm_enabled
'x86/mc16_avx2.asm',
'x86/cdef16_sse.asm',
'x86/itx16_sse.asm',
'x86/loopfilter16_sse.asm',
'x86/looprestoration16_sse.asm',
'x86/mc16_sse.asm',
)
x86/loopfilter16_sse.asm
; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 16
pb_4x1_4x5_4x9_4x13: times 4 db 0, 1
times 4 db 8, 9
pw_1: times 8 dw 1
pw_2: times 8 dw 2
pw_3: times 8 dw 3
; 4 and 16 need to be next to each other since they are used as alternates
; depending on whether bitdepth is 10 or 12
pw_4: times 8 dw 4
pw_16: times 8 dw 16
pw_8: times 8 dw 8
pw_4096: times 8 dw 4096
pb_mask: dd 1, 1, 2, 2
SECTION .text
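; apply instruction %1 (with x as the operand placeholder) to every remaining argument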
%macro REPX 2-*
%xdefine %%f(x) %1
%rep %0 - 1
%rotate 1
%%f(%1)
%endrep
%endmacro
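; broadcast a dword to all four lanes of an xmm register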
%macro SPLATD 2
movd %1, %2
pshufd %1, %1, q0000
%endmacro
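; broadcast a word to all eight lanes of an xmm register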
%macro SPLATW 2
movd %1, %2
pshuflw %1, %1, q0000
punpcklqdq %1, %1
%endmacro
; in: out:
; mm%1 a b c d a e i m
; mm%2 e f g h b f j n
; mm%3 i j k l -> c g k o
; mm%4 m n o p d h l p
%macro TRANSPOSE4X4W 5
punpcklwd m%5, m%1, m%2
punpckhwd m%1, m%2
punpcklwd m%2, m%3, m%4
punpckhwd m%3, m%4
punpckldq m%4, m%5, m%2
punpckhdq m%5, m%2
punpckldq m%2, m%1, m%3
punpckhdq m%1, m%3
SWAP %1, %4
SWAP %2, %5, %3
%endmacro
; in: out:
; xmm%1 a b c d e f g h a i q y 6 E M U
; xmm%2 i j k l m n o p b j r z 7 F N V
; xmm%3 q r s t u v w x c k s 0 8 G O W
; xmm%4 y z 0 1 2 3 4 5 d l t 1 9 H P X
; xmm%5 6 7 8 9 A B C D -> e m u 2 A I Q Y
; xmm%6 E F G H I J K L f n v 3 B J R Z
; xmm%7 M N O P Q R S T g o w 4 C K S +
; xmm%8 U V W X Y Z + = h p x 5 D L T =
%macro TRANSPOSE8X8W 9
; xmm%1 a b c d e f g h a i q y b j r z
; xmm%2 i j k l m n o p c k s 0 d l t 1
; xmm%3 q r s t u v w x -> e m u 2 f n v 3
; xmm%4 y z 0 1 2 3 4 5 g o w 4 h p x 5
TRANSPOSE4X4W %1, %2, %3, %4, %9
; xmm%5 6 7 8 9 A B C D 6 E M U 7 F N V
; xmm%6 E F G H I J K L 8 G O W 9 H P X
; xmm%7 M N O P Q R S T -> A I Q Y B J R Z
; xmm%8 U V W X Y Z + = C K S + D L T =
TRANSPOSE4X4W %5, %6, %7, %8, %9
; xmm%1 a i q y b j r z a i q y 6 E M U
; xmm%2 c k s 0 d l t 1 b j r z 7 F N V
; xmm%3 e m u 2 f n v 3 c k s 0 8 G O W
; xmm%4 g o w 4 h p x 5 d l t 1 9 H P X
; xmm%5 6 E M U 7 F N V -> e m u 2 A I Q Y
; xmm%6 8 G O W 9 H P X f n v 3 B J R Z
; xmm%7 A I Q Y B J R Z g o w 4 C K S +
; xmm%8 C K S + D L T = h p x 5 D L T =
punpckhqdq m%9, m%1, m%5
punpcklqdq m%1, m%5
punpckhqdq m%5, m%2, m%6
punpcklqdq m%2, m%6
punpckhqdq m%6, m%3, m%7
punpcklqdq m%3, m%7
punpckhqdq m%7, m%4, m%8
punpcklqdq m%4, m%8
SWAP %8, %7, %4, %5, %3, %2, %9
%endmacro
; transpose and write m3-6, everything else is scratch
%macro TRANSPOSE_8x4_AND_WRITE_4x8 0
; transpose 8x4
punpcklwd m0, m3, m4
punpckhwd m3, m4
punpcklwd m4, m5, m6
punpckhwd m5, m6
punpckldq m6, m0, m4
punpckhdq m0, m4
punpckldq m4, m3, m5
punpckhdq m3, m5
; write out
movq [dstq+strideq*0-4], xm6
movhps [dstq+strideq*1-4], xm6
movq [dstq+strideq*2-4], xm0
movhps [dstq+stride3q -4], xm0
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0-4], xm4
movhps [dstq+strideq*1-4], xm4
movq [dstq+strideq*2-4], xm3
movhps [dstq+stride3q -4], xm3
lea dstq, [dstq+strideq*4]
%endmacro
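; load the p/q pixels around a %1-wide edge (%2 = h/v), derive the E/I/H
; thresholds and filter masks from the L values, then apply the matching
; short/flat filter and write the result back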
%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
; load data
%ifidn %2, v
%if %1 == 4
lea tmpq, [dstq+mstrideq*2]
mova m3, [tmpq+strideq*0] ; p1
mova m4, [tmpq+strideq*1] ; p0
mova m5, [tmpq+strideq*2] ; q0
mova m6, [tmpq+stride3q] ; q1
%else
; load 6-8 pixels, remainder (for wd=16) will be read inline
lea tmpq, [dstq+mstrideq*4]
; we load p3 later
mova m13, [tmpq+strideq*1]
mova m3, [tmpq+strideq*2]
mova m4, [tmpq+stride3q]
mova m5, [dstq+strideq*0]
mova m6, [dstq+strideq*1]
mova m14, [dstq+strideq*2]
%if %1 != 6
mova m15, [dstq+stride3q]
%endif
%endif
%else
; load lines
%if %1 == 4
movq xm3, [dstq+strideq*0-4]
movq xm4, [dstq+strideq*1-4]
movq xm5, [dstq+strideq*2-4]
movq xm6, [dstq+stride3q -4]
lea tmpq, [dstq+strideq*4]
movq xm11, [tmpq+strideq*0-4]
movq xm13, [tmpq+strideq*1-4]
movq xm14, [tmpq+strideq*2-4]
movq xm15, [tmpq+stride3q -4]
; transpose 4x8
; xm3: A-D0,A-D4
; xm4: A-D1,A-D5
; xm5: A-D2,A-D6
; xm6: A-D3,A-D7
punpcklwd m7, m3, m4
punpcklwd m3, m11, m13
punpcklwd m4, m5, m6
punpcklwd m5, m14, m15
; xm7: A0-1,B0-1,C0-1,D0-1
; xm3: A4-5,B4-5,C4-5,D4-5
; xm4: A2-3,B2-3,C2-3,D2-3
; xm5: A6-7,B6-7,C6-7,D6-7
punpckldq m6, m7, m4
punpckhdq m7, m4
punpckldq m8, m3, m5
punpckhdq m3, m5
SWAP 3, 5
; xm6: A0-3,B0-3
; xm7: C0-3,D0-3
; xm8: A4-7,B4-7
; xm5: C4-7,D4-7
punpcklqdq m3, m6, m8
punpckhqdq m4, m6, m8
punpckhqdq m6, m7, m5
punpcklqdq m7, m5
SWAP 7, 5
; xm3: A0-7
; xm4: B0-7
; xm5: C0-7
; xm6: D0-7
%elif %1 == 6 || %1 == 8
movu xm3, [dstq+strideq*0-8]
movu xm4, [dstq+strideq*1-8]
movu xm5, [dstq+strideq*2-8]
movu xm6, [dstq+stride3q -8]
lea tmpq, [dstq+strideq*4]
movu xm11, [tmpq+strideq*0-8]
movu xm13, [tmpq+strideq*1-8]
movu xm14, [tmpq+strideq*2-8]
movu xm15, [tmpq+stride3q -8]
; transpose 8x16
; xm3: A-H0,A-H8
; xm4: A-H1,A-H9
; xm5: A-H2,A-H10
; xm6: A-H3,A-H11
; xm11: A-H4,A-H12
; xm13: A-H5,A-H13
; xm14: A-H6,A-H14
; xm15: A-H7,A-H15
punpcklwd m7, m3, m4
punpckhwd m3, m4
punpcklwd m4, m5, m6
punpckhwd m5, m6
punpcklwd m6, m11, m13
punpckhwd m11, m13
punpcklwd m13, m14, m15
punpckhwd m14, m15
; xm7: A0-1,B0-1,C0-1,D0-1
; xm3: E0-1,F0-1,G0-1,H0-1
; xm4: A2-3,B2-3,C2-3,D2-3
; xm5: E2-3,F2-3,G2-3,H2-3
; xm6: A4-5,B4-5,C4-5,D4-5
; xm11: E4-5,F4-5,G4-5,H4-5
; xm13: A6-7,B6-7,C6-7,D6-7
; xm14: E6-7,F6-7,G6-7,H6-7
punpckldq m15, m7, m4
punpckhdq m7, m4
punpckldq m9, m3, m5
punpckhdq m8, m3, m5
punpckldq m3, m6, m13
punpckhdq m6, m13
punpckldq m10, m11, m14
punpckhdq m11, m14
; xm15: A0-3,B0-3
; xm7: C0-3,D0-3
; xm9: E0-3,F0-3
; xm8: G0-3,H0-3
; xm3: A4-7,B4-7
; xm6: C4-7,D4-7
; xm10: E4-7,F4-7
; xm11: G4-7,H4-7
%if %1 != 6
punpcklqdq m0, m15, m3
%endif
punpckhqdq m13, m15, m3
punpcklqdq m3, m7, m6
punpckhqdq m4, m7, m6
punpcklqdq m5, m9, m10
punpckhqdq m6, m9, m10
punpcklqdq m14, m8, m11
%if %1 != 6
punpckhqdq m15, m8, m11
mova [rsp+5*32], m0
%endif
%else
; We only use 14 pixels but we'll need the remainder at the end for
; the second transpose
mova xm0, [dstq+strideq*0-16]
mova xm1, [dstq+strideq*1-16]
mova xm2, [dstq+strideq*2-16]
mova xm3, [dstq+stride3q -16]
lea tmpq, [dstq+strideq*4]
mova xm4, [tmpq+strideq*0-16]
mova xm5, [tmpq+strideq*1-16]
mova xm6, [tmpq+strideq*2-16]
mova xm7, [tmpq+stride3q -16]
TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8
mova [rsp+6*32], m0
mova [rsp+7*32], m1
mova [rsp+8*32], m2
mova [rsp+9*32], m3
mova [rsp+5*32], m4
mova xm0, [dstq+strideq*0]
mova xm1, [dstq+strideq*1]
mova xm2, [dstq+strideq*2]
mova xm3, [dstq+stride3q ]
lea tmpq, [dstq+strideq*4]
mova xm8, [tmpq+strideq*0]
mova xm9, [tmpq+strideq*1]
mova xm10, [tmpq+strideq*2]
mova xm11, [tmpq+stride3q ]
TRANSPOSE8X8W 0, 1, 2, 3, 8, 9, 10, 11, 4
mova [rsp+10*32], m8
mova [rsp+11*32], m9
mova [rsp+12*32], m10
mova [rsp+13*32], m11
; 5,6,7,0,1,2,3 -> 13,3,4,5,6,14,15
SWAP 13, 5, 0
SWAP 3, 6, 1, 15
SWAP 4, 7
SWAP 2, 14
%endif
%endif
; load L/E/I/H
%ifidn %2, v
%if cpuflag(sse4)
pmovzxbw m1, [lq]
pmovzxbw m0, [lq+l_strideq]
pxor m2, m2
%else
movq m1, [lq]
movq m0, [lq+l_strideq]
pxor m2, m2
REPX {punpcklbw x, m2}, m1, m0
%endif
%else
movq m0, [lq] ; l0, l1
movq m1, [lq+l_strideq] ; l2, l3
punpckldq m0, m1 ; l0, l2, l1, l3
pxor m2, m2
punpcklbw m1, m0, m2 ; l0, l2
punpckhbw m0, m2 ; l1, l3
%endif
pcmpeqw m10, m2, m0
pand m1, m10
por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1]
pcmpeqw m10, m2, m0 ; !L
psrlw m10, 1
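; if L is 0, force a huge abs-diff value so the fm test below fails and the edge is left unfiltered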
psrlw m2, m0, [lutq+128]
SPLATW m1, [lutq+136]
pminsw m2, m1
pmaxsw m2, [pw_1] ; I
psrlw m1, m0, 4 ; H
paddw m0, [pw_2]
paddw m0, m0
paddw m0, m2 ; E
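; scale E, I and H to the current bitdepth; r11 points at pw_4 (10 bpc) or pw_16 (12 bpc)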
REPX {pmullw x, [r11]}, m0, m1, m2
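; hev (high edge variance): set when max(|p1-p0|, |q1-q0|) exceeds H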
psubw m8, m3, m4 ; p1-p0
psubw m9, m5, m6 ; q1-q0
REPX {pabsw x, x}, m8, m9
pmaxsw m8, m10
pmaxsw m8, m9
pcmpgtw m7, m8, m1 ; hev
%if %1 != 4
psubw m9, m13, m4 ; p2-p0
pabsw m9, m9
pmaxsw m9, m8
%if %1 != 6
%ifidn %2, v
mova m11, [tmpq+strideq*0] ; p3
%else
mova m11, [rsp+5*32] ; p3
%endif
psubw m10, m11, m4 ; p3-p0
pabsw m10, m10
pmaxsw m9, m10
%endif
psubw m10, m5, m14 ; q2-q0
pabsw m10, m10
pmaxsw m9, m10
%if %1 != 6
psubw m10, m5, m15 ; q3-q0
pabsw m10, m10
pmaxsw m9, m10
%endif
pcmpgtw m9, [r11] ; !flat8in
psubw m10, m13, m3 ; p2-p1
pabsw m10, m10
%if %1 != 6
psubw m11, m13 ; p3-p2
pabsw m11, m11
pmaxsw m10, m11
psubw m11, m14, m15 ; q3-q2
pabsw m11, m11
pmaxsw m10, m11
%endif
psubw m11, m14, m6 ; q2-q1
pabsw m11, m11
pmaxsw m10, m11
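; maskq+0/+4/+8 hold the vmask bits for 4-, 8- and 16-wide edges;
; m12 holds this block's bit within those masks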
%if %1 == 16
SPLATD m11, [maskq+8]
SPLATD m1, [maskq+4]
por m11, m1
pand m11, m12
pcmpeqd m11, m12
pand m10, m11
%else
SPLATD m11, [maskq+4]
pand m11, m12
pcmpeqd m11, m12
pand m10, m11 ; only apply fm-wide to wd>4 blocks
%endif
pmaxsw m8, m10
%endif
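; m8 = !fm: skip the edge when the largest abs diff exceeds I or when
; abs(p0-q0)*2 + (abs(p1-q1)>>1) exceeds E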
pcmpgtw m8, m2
psubw m10, m3, m6 ; p1-q1
psubw m11, m4, m5 ; p0-q0
REPX {pabsw x, x}, m10, m11
paddw m11, m11
psrlw m10, 1
paddw m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
pcmpgtw m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
por m8, m10
%if %1 == 16
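; wd=16: also test the outer pixels (p6-p4 and q4-q6) against p0/q0 for flat8out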
%ifidn %2, v
lea tmpq, [dstq+mstrideq*8]
mova m0, [tmpq+strideq*1]
mova m1, [tmpq+strideq*2]
mova m2, [tmpq+stride3q]
%else
mova m0, [rsp+7*32]
mova m1, [rsp+8*32]
mova m2, [rsp+9*32]
%endif
REPX {psubw x, m4}, m0, m1, m2
REPX {pabsw x, x}, m0, m1, m2
pmaxsw m1, m0
pmaxsw m1, m2
%ifidn %2, v
lea tmpq, [dstq+strideq*4]
mova m0, [tmpq+strideq*0]
mova m2, [tmpq+strideq*1]
mova m10, [tmpq+strideq*2]
%else
mova m0, [rsp+10*32]
mova m2, [rsp+11*32]
mova m10, [rsp+12*32]
%endif
REPX {psubw x, m5}, m0, m2, m10
REPX {pabsw x, x}, m0, m2, m10
pmaxsw m0, m2
pmaxsw m1, m10
pmaxsw m1, m0
pcmpgtw m1, [r11] ; !flat8out
por m1, m9 ; !flat8in | !flat8out
SPLATD m2, [maskq+8]
pand m10, m2, m12
pcmpeqd m10, m12
pandn m1, m10 ; flat16
pandn m10, m8, m1 ; flat16 & fm
SWAP 1, 10
SPLATD m10, [maskq+4]
por m10, m2
pand m2, m10, m12
pcmpeqd m2, m12
pandn m9, m2 ; flat8in
pandn m2, m8, m9
SWAP 2, 9
SPLATD m2, [maskq+0]
por m2, m10
pand m2, m12
pcmpeqd m2, m12
pandn m8, m2
pandn m0, m9, m8 ; fm & !flat8 & !flat16
SWAP 0, 8
pandn m0, m1, m9 ; flat8 & !flat16
SWAP 0, 9
%elif %1 != 4
SPLATD m0, [maskq+4]
pand m2, m0, m12
pcmpeqd m2, m12
pandn m9, m2
pandn m2, m8, m9 ; flat8 & fm
SWAP 2, 9
SPLATD m2, [maskq+0]
por m0, m2
pand m0, m12
pcmpeqd m0, m12
pandn m8, m0
pandn m0, m9, m8 ; fm & !flat8
SWAP 0, 8
%else
SPLATD m0, [maskq+0]
pand m0, m12
pcmpeqd m0, m12
pandn m8, m0 ; fm
%endif
; short filter
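; f = iclip_diff(3*(q0-p0) + (iclip_diff(p1-q1) & hev)) & fm;
; p0 += (f+3)>>3, q0 -= (f+4)>>3; when !hev, p1 and q1 move by (f1+1)>>1;
; results are clamped to the valid pixel range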
SPLATW m0, r7m
pcmpeqw m2, m2
psrlw m0, 1 ; 511 or 2047
pxor m2, m0 ; -512 or -2048
psubw m10, m5, m4
paddw m11, m10, m10
paddw m11, m10
psubw m10, m3, m6 ; iclip_diff(p1-q1)
pminsw m10, m0
pmaxsw m10, m2
pand m10, m7 ; f=iclip_diff(p1-q1)&hev
paddw m10, m11 ; f=iclip_diff(3*(q0-p0)+f)
pminsw m10, m0
pmaxsw m10, m2
pand m8, m10 ; f&=fm
paddw m10, m8, [pw_3]
paddw m8, [pw_4]
REPX {pminsw x, m0}, m10, m8
psraw m10, 3 ; f2
psraw m8, 3 ; f1
paddw m4, m10
psubw m5, m8
paddw m8, [pw_1]
psraw m8, 1 ; f=(f1+1)>>1
pandn m7, m8 ; f&=!hev
SWAP 7, 8
paddw m3, m8
psubw m6, m8
pxor m8, m8
psubw m0, m2 ; 1023 or 4095
REPX {pminsw x, m0}, m3, m4, m5, m6
REPX {pmaxsw x, m8}, m3, m4, m5, m6
%if %1 == 16
; m3-6 = p1/p0/q0/q1, m9=flat8, m1=flat16
; m12=filter bits mask
; m13-15=p2/q2/q3
; m0,2,7-8,10-11 = free
; flat16 filter
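; outputs are running weighted sums (total weight 16, +8 rounding, >>4);
; each line subtracts the oldest tap and adds the next (see the sub/add comments below)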
%ifidn %2, v
lea tmpq, [dstq+mstrideq*8]
mova m0, [tmpq+strideq*1] ; p6
mova m2, [tmpq+strideq*2] ; p5
mova m7, [tmpq+stride3q] ; p4
mova m11, [tmpq+strideq*4] ; p3
%else
mova m0, [rsp+7*32]
mova m2, [rsp+8*32]
mova m7, [rsp+9*32]
mova m11, [rsp+5*32]
%endif
mova [rsp+ 0*32], m9
; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
psllw m8, m0, 3 ; p6*8
paddw m8, [pw_8]
paddw m10, m2, m7 ; p5+p4
psubw m8, m0
paddw m10, m10 ; (p5+p4)*2
paddw m8, m11 ; p6*7+p3
paddw m10, m13 ; (p5+p4)*2+p2
paddw m8, m3 ; p6*7+p3+p1
paddw m10, m4 ; (p5+p4)*2+p2+p0
paddw m8, m5 ; p6*7+p3+p1+q0
paddw m8, m10 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
psrlw m10, m8, 4
pand m10, m1
pandn m9, m1, m2
por m10, m9
%ifidn %2, v
mova [tmpq+strideq*2], m10 ; p5
%else
mova [rsp+8*32], m10
%endif
; sub p6*2, add p3/q1
paddw m8, m11
paddw m10, m0, m0
paddw m8, m6