Commit 8c5d34c8 authored by Henrik Gramner

Add tail call optimizations in SSSE3 itx

parent 1703f21f
Pipeline #3765 passed with stages in 5 minutes and 18 seconds
@@ -682,7 +682,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

-%macro TAIL_CALL 2 ; callee, is_nonadjacent
+%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
 %if has_epilogue
     call %1
     RET
...
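The hunk above only changes the macro's parameter specification from 2 to 1-2 1, making the is_nonadjacent argument optional with a default of 1, so call sites can simply write TAIL_CALL target. The rest of the body lies outside the hunk; going by the upstream x86inc.asm this file tracks, it is approximately the following sketch:

%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
%if has_epilogue
    ; an epilogue is required, so this must remain a real call + RET
    call %1
    RET
%elif %2
    ; no epilogue: the call + RET pair collapses into a single jmp
    jmp %1
%endif
    ; (with has_epilogue == 0 and is_nonadjacent == 0 nothing is emitted;
    ;  execution falls through into a callee placed directly afterwards)
%endmacro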
@@ -198,9 +198,11 @@ SECTION .text
 %macro INV_TXFM_FN 5+ ; type1, type2, fast_thresh, size, xmm/stack
 cglobal inv_txfm_add_%1_%2_%4, 4, 6, %5, dst, stride, coeff, eob, tx2
     %undef cmp
+    %define %%p1 m(i%1_%4_internal)
 %if ARCH_X86_32
     LEA r5, $$
 %endif
+%if has_epilogue
 %if %3 > 0
     cmp eobd, %3
     jle %%end
@@ -209,10 +211,23 @@ cglobal inv_txfm_add_%1_%2_%4, 4, 6, %5, dst, stride, coeff, eob, tx2
     jz %%end
 %endif
     lea tx2q, [o(m(i%2_%4_internal).pass2)]
-    call m(i%1_%4_internal)
+    call %%p1
     RET
+%%end:
+%else
+    lea tx2q, [o(m(i%2_%4_internal).pass2)]
+%if %3 > 0
+    cmp eobd, %3
+    jg %%p1
+%elif %3 == 0
+    test eobd, eobd
+    jnz %%p1
+%else
+    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
 ALIGN function_align
 %%end:
+%endif
+%endif
 %endmacro

 %macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
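In the new no-epilogue path the eob test is inverted: instead of call %%p1 followed by RET, the code jumps straight to the first-pass function (jg/jnz %%p1) and otherwise falls through to the dc-only fast path, making the first pass the tail of inv_txfm_add_*. The times line handles the unconditional case: %%p1 is always defined at or after %%end, so (%%end - %%p1) is either 0, when the first-pass function starts exactly at %%end and execution can simply fall through, or negative, when it lives further down and a jmp is required. Shifting bit 31 into the low bit and masking yields a repeat count of 1 for a small negative difference and 0 for zero. A minimal standalone illustration of the same idiom, with hypothetical labels:

BITS 64
entry:
    ; emit the jmp only when first_pass does not begin at fall_through;
    ; ((x >> 31) & 1) is 1 for a small negative x and 0 for x == 0
    times ((fall_through - first_pass) >> 31) & 1 jmp first_pass
fall_through:
first_pass: ; adjacent here, so the difference is 0 and no jmp is emitted
    ret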
@@ -225,8 +240,7 @@ ALIGN function_align
     punpcklwd m0, m0
     punpckhdq m1, m0, m0
     punpckldq m0, m0
-    call m(iadst_4x4_internal).end
-    RET
+    TAIL_CALL m(iadst_4x4_internal).end
 %elifidn %1_%2, identity_dct
     mova m1, [coeffq+16*0]
     mova m2, [coeffq+16*1]
@@ -238,8 +252,7 @@ ALIGN function_align
     pmulhrsw m0, [o(pw_5793x4)]
     pmulhrsw m0, [o(pw_2896x8)]
     mova m1, m0
-    call m(iadst_4x4_internal).end
-    RET
+    TAIL_CALL m(iadst_4x4_internal).end
 %elif %3 >= 0
     pshuflw m0, [coeffq], q0000
     punpcklqdq m0, m0
@@ -259,13 +272,11 @@ ALIGN function_align
     pmulhrsw m0, m1
 %endif
     mova m1, m0
-    call m(iadst_4x4_internal).end2
-    RET
+    TAIL_CALL m(iadst_4x4_internal).end2
 %else ; adst / flipadst
     pmulhrsw m1, m0, [o(iadst4_dconly2b)]
     pmulhrsw m0, [o(iadst4_dconly2a)]
-    call m(i%2_4x4_internal).end2
-    RET
+    TAIL_CALL m(i%2_4x4_internal).end2
 %endif
 %endif
 %endmacro
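Every remaining hunk applies the same substitution: a trailing call + RET pair at the end of a dc-only shortcut becomes a single TAIL_CALL. Assuming the macro body sketched earlier, the first change above now expands as follows:

; before:
    call m(iadst_4x4_internal).end
    RET
; after, with has_epilogue == 0:
    jmp m(iadst_4x4_internal).end
; (with has_epilogue != 0 it expands back into the original call + RET)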
@@ -624,8 +635,7 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
     punpckldq m0, m0
     punpckhdq m3, m2, m2
     punpckldq m2, m2
-    call m(iadst_4x8_internal).end3
-    RET
+    TAIL_CALL m(iadst_4x8_internal).end3
 %elifidn %1_%2, identity_dct
     movd m0, [coeffq+16*0]
     punpcklwd m0, [coeffq+16*1]
@@ -642,8 +652,7 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
     mova m1, m0
     mova m2, m0
     mova m3, m0
-    call m(iadst_4x8_internal).end3
-    RET
+    TAIL_CALL m(iadst_4x8_internal).end3
 %elifidn %1_%2, dct_dct
     pshuflw m0, [coeffq], q0000
     punpcklqdq m0, m0
@@ -656,8 +665,7 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
     mova m1, m0
     mova m2, m0
     mova m3, m0
-    call m(iadst_4x8_internal).end4
-    RET
+    TAIL_CALL m(iadst_4x8_internal).end4
 %else ; adst_dct / flipadst_dct
     pshuflw m0, [coeffq], q0000
     punpcklqdq m0, m0
@@ -674,8 +682,7 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
     mova m1, m0
     mova m2, m0
     mova m3, m0
-    call m(iadst_4x8_internal).end4
-    RET
+    TAIL_CALL m(iadst_4x8_internal).end4
 %endif
 %endif
 %endmacro
@@ -923,8 +930,7 @@ cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
 %endif
 %endif
 %endif
-    call m(iadst_8x4_internal).end2
-    RET
+    TAIL_CALL m(iadst_8x4_internal).end2
 %endif
 %endmacro
...