Commit 084ec98b authored by Xuefeng Jiang

optimise

parent 4c7ce1e9
Pipeline #6091 passed with stages
in 6 minutes and 23 seconds
......@@ -2749,17 +2749,22 @@ cglobal ipred_cfl_ac_444, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h
psubusb m2, m5, m3
psubusb m0, m3, m5
por m2, m0 ; tdiff
%ifnum %2
pminub m2, m%2
pcmpeqb m0, m%2, m2 ; ldiff <= tdiff
mova [rsp], m3
BLEND m0, m3, m%1
mova m3, [rsp]
%else
mova m0, %2
pminub m2, m0
pcmpeqb m0, m2
%endif
pminub m1, m2
pcmpeqb m1, m2 ; ldiff <= tldiff && tdiff <= tldiff
BLEND m1, m0, m5
mova m2, m3
BLEND m0, m2, m%1
BLEND m1, m0, m5
%endmacro
cglobal ipred_paeth, 3, 6, 8, -9*16, dst, stride, tl, w, h
cglobal ipred_paeth, 3, 6, 8, -7*16, dst, stride, tl, w, h
%define base r5-ipred_paeth_ssse3_table
tzcnt wd, wm
movifnidn hd, hm
......@@ -2768,8 +2773,7 @@ cglobal ipred_paeth, 3, 6, 8, -9*16, dst, stride, tl, w, h
pshufb m5, m0
LEA r5, ipred_paeth_ssse3_table
movsxd wq, [r5+wq*4]
movd m4, [base+ipred_paeth_shuf]
pshufd m4, m4, q0000
movddup m4, [base+ipred_paeth_shuf]
add wq, r5
jmp wq
.w4:
......@@ -2795,12 +2799,10 @@ cglobal ipred_paeth, 3, 6, 8, -9*16, dst, stride, tl, w, h
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4_loop
.ret:
RET
ALIGN function_align
.w8:
movu m6, [tlq+1]
punpcklqdq m6, m6
movddup m6, [tlq+1]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
......@@ -2838,25 +2840,22 @@ ALIGN function_align
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
mova [rsp+16], m6
mova [rsp+32], m7
mova [rsp ], m6
mova [rsp+16], m7
movu m6, [tlq+17]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
mova [rsp+48], m6
mova [rsp+64], m7
mova [rsp+32], m6
.w32_loop:
dec tlq
movd m3, [tlq]
pxor m1, m1
pshufb m3, m1
mova m6, [rsp+16]
mova m7, [rsp+32]
PAETH 6, 7
mova [dstq], m1
mova m6, [rsp+48]
mova m7, [rsp+64]
mova m6, [rsp]
PAETH 6, [rsp+16]
mova [dstq ], m1
mova m6, [rsp+32]
PAETH 6, 7
mova [dstq+16], m1
add dstq, strideq
......@@ -2869,45 +2868,40 @@ ALIGN function_align
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
mova [rsp+16], m6
mova [rsp+32], m7
mova [rsp ], m6
mova [rsp+16], m7
movu m6, [tlq+17]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
mova [rsp+48], m6
mova [rsp+64], m7
mova [rsp+32], m6
mova [rsp+48], m7
movu m6, [tlq+33]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
mova [rsp+80], m6
mova [rsp+96], m7
mova [rsp+64], m6
mova [rsp+80], m7
movu m6, [tlq+49]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
mova [rsp+112], m6
mova [rsp+128], m7
mova [rsp+96], m6
.w64_loop:
dec tlq
movd m3, [tlq]
pxor m1, m1
pshufb m3, m1
mova m6, [rsp+16]
mova m7, [rsp+32]
PAETH 6, 7
mova [dstq], m1
mova m6, [rsp+48]
mova m7, [rsp+64]
PAETH 6, 7
mova m6, [rsp]
PAETH 6, [rsp+16]
mova [dstq ], m1
mova m6, [rsp+32]
PAETH 6, [rsp+48]
mova [dstq+16], m1
mova m6, [rsp+80]
mova m7, [rsp+96]
PAETH 6, 7
mova m6, [rsp+64]
PAETH 6, [rsp+80]
mova [dstq+32], m1
mova m6, [rsp+112]
mova m7, [rsp+128]
mova m6, [rsp+96]
PAETH 6, 7
mova [dstq+48], m1
add dstq, strideq
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment