Commit 585ac462 authored by Liwei Wang

Add SSSE3 implementation for the 8x32 and 32x8 blocks in itx

Cycle times:
inv_txfm_add_8x32_dct_dct_0_8bpc_c: 1164.7
inv_txfm_add_8x32_dct_dct_0_8bpc_ssse3: 79.5
inv_txfm_add_8x32_dct_dct_1_8bpc_c: 11291.6
inv_txfm_add_8x32_dct_dct_1_8bpc_ssse3: 508.5
inv_txfm_add_8x32_dct_dct_2_8bpc_c: 10720.4
inv_txfm_add_8x32_dct_dct_2_8bpc_ssse3: 507.9
inv_txfm_add_8x32_dct_dct_3_8bpc_c: 12351.5
inv_txfm_add_8x32_dct_dct_3_8bpc_ssse3: 687.2
inv_txfm_add_8x32_dct_dct_4_8bpc_c: 10402.3
inv_txfm_add_8x32_dct_dct_4_8bpc_ssse3: 687.9
inv_txfm_add_8x32_identity_identity_0_8bpc_c: 3485.0
inv_txfm_add_8x32_identity_identity_0_8bpc_ssse3: 97.7
inv_txfm_add_8x32_identity_identity_1_8bpc_c: 3495.7
inv_txfm_add_8x32_identity_identity_1_8bpc_ssse3: 97.7
inv_txfm_add_8x32_identity_identity_2_8bpc_c: 3503.7
inv_txfm_add_8x32_identity_identity_2_8bpc_ssse3: 97.8
inv_txfm_add_8x32_identity_identity_3_8bpc_c: 3489.5
inv_txfm_add_8x32_identity_identity_3_8bpc_ssse3: 184.4
inv_txfm_add_8x32_identity_identity_4_8bpc_c: 3498.1
inv_txfm_add_8x32_identity_identity_4_8bpc_ssse3: 182.8
inv_txfm_add_32x8_dct_dct_0_8bpc_c: 1220.4
inv_txfm_add_32x8_dct_dct_0_8bpc_ssse3: 65.6
inv_txfm_add_32x8_dct_dct_1_8bpc_c: 11120.7
inv_txfm_add_32x8_dct_dct_1_8bpc_ssse3: 623.8
inv_txfm_add_32x8_dct_dct_2_8bpc_c: 12236.3
inv_txfm_add_32x8_dct_dct_2_8bpc_ssse3: 624.7
inv_txfm_add_32x8_dct_dct_3_8bpc_c: 10866.3
inv_txfm_add_32x8_dct_dct_3_8bpc_ssse3: 694.1
inv_txfm_add_32x8_dct_dct_4_8bpc_c: 10322.8
inv_txfm_add_32x8_dct_dct_4_8bpc_ssse3: 692.5
inv_txfm_add_32x8_identity_identity_0_8bpc_c: 3368.1
inv_txfm_add_32x8_identity_identity_0_8bpc_ssse3: 98.6
inv_txfm_add_32x8_identity_identity_1_8bpc_c: 3381.1
inv_txfm_add_32x8_identity_identity_1_8bpc_ssse3: 98.3
inv_txfm_add_32x8_identity_identity_2_8bpc_c: 3376.6
inv_txfm_add_32x8_identity_identity_2_8bpc_ssse3: 98.3
inv_txfm_add_32x8_identity_identity_3_8bpc_c: 3364.3
inv_txfm_add_32x8_identity_identity_3_8bpc_ssse3: 182.2
inv_txfm_add_32x8_identity_identity_4_8bpc_c: 3390.0
inv_txfm_add_32x8_identity_identity_4_8bpc_ssse3: 182.2
parent 5d944dc6
...@@ -86,6 +86,8 @@ decl_itx16_fns(16, 4, ssse3); ...@@ -86,6 +86,8 @@ decl_itx16_fns(16, 4, ssse3);
decl_itx16_fns( 8, 16, ssse3); decl_itx16_fns( 8, 16, ssse3);
decl_itx16_fns(16, 8, ssse3); decl_itx16_fns(16, 8, ssse3);
decl_itx12_fns(16, 16, ssse3); decl_itx12_fns(16, 16, ssse3);
decl_itx2_fns ( 8, 32, ssse3);
decl_itx2_fns (32, 8, ssse3);
void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) { void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
...@@ -138,6 +140,8 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) { ...@@ -138,6 +140,8 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
assign_itx16_fn(R, 8, 16, ssse3); assign_itx16_fn(R, 8, 16, ssse3);
assign_itx16_fn(R, 16, 8, ssse3); assign_itx16_fn(R, 16, 8, ssse3);
assign_itx12_fn(, 16, 16, ssse3); assign_itx12_fn(, 16, 16, ssse3);
assign_itx2_fn (R, 8, 32, ssse3);
assign_itx2_fn (R, 32, 8, ssse3);
#endif #endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
......
...@@ -76,6 +76,23 @@ pw_3344x8: times 8 dw 3344*8 ...@@ -76,6 +76,23 @@ pw_3344x8: times 8 dw 3344*8
pw_5793x4: times 8 dw 5793*4 pw_5793x4: times 8 dw 5793*4
pw_8192: times 8 dw 8192 pw_8192: times 8 dw 8192
pw_m8192: times 8 dw -8192 pw_m8192: times 8 dw -8192
; Word constants introduced together with the 8x32 / 32x8 transforms.
; Each pw_<N>x8 entry holds eight copies of N*8 (pw_m<N>x8 holds -N*8).
; Used as a pmulhrsw multiplier, such a value gives round(x*N/4096),
; because pmulhrsw computes (x*c + 0x4000) >> 15 and c = N*8.
; pw_201x8, pw_4091x8, pw_3035x8 and pw_m2751x8 are read by
; idct_8x32_internal.main_fast.
; NOTE(review): the users of pw_5 and of the remaining pw_*x8 entries are
; not visible in this excerpt - confirm against the rest of the file.
pw_5: times 8 dw 5
pw_201x8: times 8 dw 201*8
pw_4091x8: times 8 dw 4091*8
pw_m2751x8: times 8 dw -2751*8
pw_3035x8: times 8 dw 3035*8
pw_1751x8: times 8 dw 1751*8
pw_3703x8: times 8 dw 3703*8
pw_m1380x8: times 8 dw -1380*8
pw_3857x8: times 8 dw 3857*8
pw_995x8: times 8 dw 995*8
pw_3973x8: times 8 dw 3973*8
pw_m2106x8: times 8 dw -2106*8
pw_3513x8: times 8 dw 3513*8
pw_2440x8: times 8 dw 2440*8
pw_3290x8: times 8 dw 3290*8
pw_m601x8: times 8 dw -601*8
pw_4052x8: times 8 dw 4052*8
iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424 iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424
iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568 iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568
...@@ -1949,6 +1966,16 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -1949,6 +1966,16 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m6, [%1+%2*6] mova m6, [%1+%2*6]
%endmacro %endmacro
; SAVE_7ROWS dst, stride
; Stores registers m0..m6 to memory with aligned moves (mova):
; register mN is written to [dst + stride*N] for N = 0..6.
;   %1 = destination base address expression
;   %2 = distance between consecutive rows
; m7 is not stored by this macro and no register is modified.
%macro SAVE_7ROWS 2 ;dst, stride
mova [%1+%2*0], m0
mova [%1+%2*1], m1
mova [%1+%2*2], m2
mova [%1+%2*3], m3
mova [%1+%2*4], m4
mova [%1+%2*5], m5
mova [%1+%2*6], m6
%endmacro
%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3] %macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3]
punpckhwd m%5, m%4, m%1 ;packed in13 in3 punpckhwd m%5, m%4, m%1 ;packed in13 in3
punpcklwd m%1, m%4 ;packed in1 in15 punpcklwd m%1, m%4 ;packed in1 in15
...@@ -1993,7 +2020,7 @@ INV_TXFM_16X4_FN dct, flipadst, 0 ...@@ -1993,7 +2020,7 @@ INV_TXFM_16X4_FN dct, flipadst, 0
INV_TXFM_16X4_FN dct, identity, 3 INV_TXFM_16X4_FN dct, identity, 3
cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_7ROWS coeffq, 16 LOAD_7ROWS coeffq, 16
call .main call .main
.pass1_end: .pass1_end:
...@@ -2098,7 +2125,7 @@ INV_TXFM_16X4_FN adst, flipadst ...@@ -2098,7 +2125,7 @@ INV_TXFM_16X4_FN adst, flipadst
INV_TXFM_16X4_FN adst, identity INV_TXFM_16X4_FN adst, identity
cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_7ROWS coeffq, 16 LOAD_7ROWS coeffq, 16
call .main call .main
punpckhwd m6, m7, m0 ;packed -out11, -out15 punpckhwd m6, m7, m0 ;packed -out11, -out15
...@@ -2236,7 +2263,7 @@ INV_TXFM_16X4_FN flipadst, flipadst ...@@ -2236,7 +2263,7 @@ INV_TXFM_16X4_FN flipadst, flipadst
INV_TXFM_16X4_FN flipadst, identity INV_TXFM_16X4_FN flipadst, identity
cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_7ROWS coeffq, 16 LOAD_7ROWS coeffq, 16
call m(iadst_16x4_internal).main call m(iadst_16x4_internal).main
punpcklwd m6, m7, m0 ;packed out11, out15 punpcklwd m6, m7, m0 ;packed out11, out15
...@@ -2266,7 +2293,7 @@ INV_TXFM_16X4_FN identity, flipadst ...@@ -2266,7 +2293,7 @@ INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity INV_TXFM_16X4_FN identity, identity
cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_7ROWS coeffq, 16 LOAD_7ROWS coeffq, 16
mova m7, [o(pw_5793x4)] mova m7, [o(pw_5793x4)]
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6 REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
...@@ -2299,17 +2326,6 @@ cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -2299,17 +2326,6 @@ cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova [%1+%2*7], m7 mova [%1+%2*7], m7
%endmacro %endmacro
%macro ITX_8X16_LOAD_STACK_COEFS 0
mova m0, [rsp+gprsize+16*3]
mova m1, [rsp+gprsize+16*4]
mova m2, [rsp+gprsize+16*5]
mova m3, [rsp+gprsize+16*6]
mova m4, [rsp+gprsize+16*7]
mova m5, [rsp+gprsize+16*8]
mova m6, [rsp+gprsize+16*9]
mova m7, [rsp+gprsize+32*5]
%endmacro
%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh %macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 8x16, 8, 16*12 INV_TXFM_FN %1, %2, %3, 8x16, 8, 16*12
%ifidn %1_%2, dct_dct %ifidn %1_%2, dct_dct
...@@ -2435,14 +2451,7 @@ cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -2435,14 +2451,7 @@ cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass2_main: .pass2_main:
call m(idct_8x8_internal).main call m(idct_8x8_internal).main
mova [rsp+gprsize+16*3], m0 SAVE_7ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*4], m1
mova [rsp+gprsize+16*5], m2
mova [rsp+gprsize+16*6], m3
mova [rsp+gprsize+16*7], m4
mova [rsp+gprsize+16*8], m5
mova [rsp+gprsize+16*9], m6
mova m0, [coeffq+16*2 ] mova m0, [coeffq+16*2 ]
mova m1, [coeffq+16*6 ] mova m1, [coeffq+16*6 ]
mova m2, [coeffq+16*10] mova m2, [coeffq+16*10]
...@@ -2458,7 +2467,7 @@ cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -2458,7 +2467,7 @@ cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(idct_8x8_internal).end jmp m(idct_8x8_internal).end
.end: .end:
ITX_8X16_LOAD_STACK_COEFS LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*0], m7
lea tx2q, [o(m(idct_8x16_internal).end1)] lea tx2q, [o(m(idct_8x16_internal).end1)]
mov dstq, r3 mov dstq, r3
...@@ -2512,7 +2521,7 @@ cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -2512,7 +2521,7 @@ cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(iadst_8x8_internal).end jmp m(iadst_8x8_internal).end
.end: .end:
ITX_8X16_LOAD_STACK_COEFS LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*0], m7
lea tx2q, [o(m(idct_8x16_internal).end1)] lea tx2q, [o(m(idct_8x16_internal).end1)]
mov dstq, r3 mov dstq, r3
...@@ -2560,7 +2569,7 @@ cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -2560,7 +2569,7 @@ cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(iflipadst_8x8_internal).end jmp m(iflipadst_8x8_internal).end
.end: .end:
ITX_8X16_LOAD_STACK_COEFS LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*0], m7
lea tx2q, [o(m(idct_8x16_internal).end1)] lea tx2q, [o(m(idct_8x16_internal).end1)]
mov dstq, r3 mov dstq, r3
...@@ -2703,13 +2712,7 @@ INV_TXFM_16X8_FN dct, flipadst ...@@ -2703,13 +2712,7 @@ INV_TXFM_16X8_FN dct, flipadst
cglobal idct_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 cglobal idct_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS coeffq+16*0, 32, 1 LOAD_8ROWS coeffq+16*0, 32, 1
call m(idct_8x8_internal).main call m(idct_8x8_internal).main
mova [rsp+gprsize+16*3], m0 SAVE_7ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*4], m1
mova [rsp+gprsize+16*5], m2
mova [rsp+gprsize+16*6], m3
mova [rsp+gprsize+16*7], m4
mova [rsp+gprsize+16*8], m5
mova [rsp+gprsize+16*9], m6
LOAD_8ROWS coeffq+16*1, 32, 1 LOAD_8ROWS coeffq+16*1, 32, 1
call .main call .main
...@@ -2719,7 +2722,7 @@ cglobal idct_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -2719,7 +2722,7 @@ cglobal idct_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass1_end: .pass1_end:
SAVE_8ROWS coeffq+16*1, 32 SAVE_8ROWS coeffq+16*1, 32
ITX_8X16_LOAD_STACK_COEFS LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*0], m7
mov tx2q, r3 mov tx2q, r3
jmp m(idct_8x8_internal).pass1_end jmp m(idct_8x8_internal).pass1_end
...@@ -2863,7 +2866,7 @@ cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -2863,7 +2866,7 @@ cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass1_end: .pass1_end:
SAVE_8ROWS coeffq+16*1, 32 SAVE_8ROWS coeffq+16*1, 32
ITX_8X16_LOAD_STACK_COEFS LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*0], m7
mov tx2q, r3 mov tx2q, r3
jmp m(iadst_8x8_internal).pass1_end jmp m(iadst_8x8_internal).pass1_end
...@@ -3067,7 +3070,7 @@ cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -3067,7 +3070,7 @@ cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m7, [rsp+gprsize+16*0] mova m7, [rsp+gprsize+16*0]
SAVE_8ROWS coeffq+16*0, 32 SAVE_8ROWS coeffq+16*0, 32
ITX_8X16_LOAD_STACK_COEFS LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*0], m7
mov r3, tx2q mov r3, tx2q
lea tx2q, [o(m(iflipadst_16x8_internal).pass1_end)] lea tx2q, [o(m(iflipadst_16x8_internal).pass1_end)]
...@@ -3098,15 +3101,7 @@ INV_TXFM_16X8_FN identity, flipadst ...@@ -3098,15 +3101,7 @@ INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity INV_TXFM_16X8_FN identity, identity
cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m7, [o(pw_2896x8)] LOAD_8ROWS coeffq+16*8, 16, 1
pmulhrsw m0, m7, [coeffq+16*8 ]
pmulhrsw m1, m7, [coeffq+16*9 ]
pmulhrsw m2, m7, [coeffq+16*10]
pmulhrsw m3, m7, [coeffq+16*11]
pmulhrsw m4, m7, [coeffq+16*12]
pmulhrsw m5, m7, [coeffq+16*13]
pmulhrsw m6, m7, [coeffq+16*14]
pmulhrsw m7, [coeffq+16*15]
mov r3, tx2q mov r3, tx2q
lea tx2q, [o(m(iidentity_16x8_internal).pass1_end)] lea tx2q, [o(m(iidentity_16x8_internal).pass1_end)]
...@@ -3266,30 +3261,10 @@ INV_TXFM_16X16_FN dct, adst ...@@ -3266,30 +3261,10 @@ INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst INV_TXFM_16X16_FN dct, flipadst
cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m0, [coeffq+16*1 ] LOAD_8ROWS coeffq+16*1, 64
mova m1, [coeffq+16*5 ]
mova m2, [coeffq+16*9 ]
mova m3, [coeffq+16*13]
mova m4, [coeffq+16*17]
mova m5, [coeffq+16*21]
mova m6, [coeffq+16*25]
mova m7, [coeffq+16*29]
call m(idct_8x8_internal).main call m(idct_8x8_internal).main
mova [rsp+gprsize+16*3], m0 SAVE_7ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*4], m1 LOAD_8ROWS coeffq+16*3, 64
mova [rsp+gprsize+16*5], m2
mova [rsp+gprsize+16*6], m3
mova [rsp+gprsize+16*7], m4
mova [rsp+gprsize+16*8], m5
mova [rsp+gprsize+16*9], m6
mova m0, [coeffq+16*3 ]
mova m1, [coeffq+16*7 ]
mova m2, [coeffq+16*11]
mova m3, [coeffq+16*15]
mova m4, [coeffq+16*19]
mova m5, [coeffq+16*23]
mova m6, [coeffq+16*27]
mova m7, [coeffq+16*31]
call m(idct_16x8_internal).main call m(idct_16x8_internal).main
mov r3, tx2q mov r3, tx2q
lea tx2q, [o(m(idct_16x16_internal).pass1_end)] lea tx2q, [o(m(idct_16x16_internal).pass1_end)]
...@@ -3298,7 +3273,7 @@ cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -3298,7 +3273,7 @@ cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass1_end: .pass1_end:
SAVE_8ROWS coeffq+16*17, 32 SAVE_8ROWS coeffq+16*17, 32
ITX_8X16_LOAD_STACK_COEFS LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*0], m7
lea tx2q, [o(m(idct_16x16_internal).pass1_end1)] lea tx2q, [o(m(idct_16x16_internal).pass1_end1)]
mova m7, [o(pw_8192)] mova m7, [o(pw_8192)]
...@@ -3306,30 +3281,10 @@ cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -3306,30 +3281,10 @@ cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass1_end1: .pass1_end1:
SAVE_8ROWS coeffq+16*1, 32 SAVE_8ROWS coeffq+16*1, 32
mova m0, [coeffq+16*0 ] LOAD_8ROWS coeffq+16*0, 64
mova m1, [coeffq+16*4 ]
mova m2, [coeffq+16*8 ]
mova m3, [coeffq+16*12]
mova m4, [coeffq+16*16]
mova m5, [coeffq+16*20]
mova m6, [coeffq+16*24]
mova m7, [coeffq+16*28]
call m(idct_8x8_internal).main call m(idct_8x8_internal).main
mova [rsp+gprsize+16*3], m0 SAVE_7ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*4], m1 LOAD_8ROWS coeffq+16*2, 64
mova [rsp+gprsize+16*5], m2
mova [rsp+gprsize+16*6], m3
mova [rsp+gprsize+16*7], m4
mova [rsp+gprsize+16*8], m5
mova [rsp+gprsize+16*9], m6
mova m0, [coeffq+16*2 ]
mova m1, [coeffq+16*6 ]
mova m2, [coeffq+16*10]
mova m3, [coeffq+16*14]
mova m4, [coeffq+16*18]
mova m5, [coeffq+16*22]
mova m6, [coeffq+16*26]
mova m7, [coeffq+16*30]
call m(idct_16x8_internal).main call m(idct_16x8_internal).main
lea tx2q, [o(m(idct_16x16_internal).pass1_end2)] lea tx2q, [o(m(idct_16x16_internal).pass1_end2)]
mova m7, [o(pw_8192)] mova m7, [o(pw_8192)]
...@@ -3337,7 +3292,7 @@ cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -3337,7 +3292,7 @@ cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass1_end2: .pass1_end2:
SAVE_8ROWS coeffq+16*16, 32 SAVE_8ROWS coeffq+16*16, 32
ITX_8X16_LOAD_STACK_COEFS LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*0], m7
mov tx2q, r3 mov tx2q, r3
mova m7, [o(pw_8192)] mova m7, [o(pw_8192)]
...@@ -3348,7 +3303,7 @@ cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -3348,7 +3303,7 @@ cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(idct_8x16_internal).pass2_pre jmp m(idct_8x16_internal).pass2_pre
.end: .end:
ITX_8X16_LOAD_STACK_COEFS LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*0], m7
lea tx2q, [o(m(idct_16x16_internal).end1)] lea tx2q, [o(m(idct_16x16_internal).end1)]
mov dstq, r3 mov dstq, r3
...@@ -3443,7 +3398,7 @@ cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -3443,7 +3398,7 @@ cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass1_end: .pass1_end:
SAVE_8ROWS coeffq+16*17, 32 SAVE_8ROWS coeffq+16*17, 32
ITX_8X16_LOAD_STACK_COEFS LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*0], m7
lea tx2q, [o(m(iadst_16x16_internal).pass1_end1)] lea tx2q, [o(m(iadst_16x16_internal).pass1_end1)]
mova m7, [o(pw_8192)] mova m7, [o(pw_8192)]
...@@ -3460,7 +3415,7 @@ cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -3460,7 +3415,7 @@ cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass1_end2: .pass1_end2:
SAVE_8ROWS coeffq+16*16, 32 SAVE_8ROWS coeffq+16*16, 32
ITX_8X16_LOAD_STACK_COEFS LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*0], m7
mov tx2q, r3 mov tx2q, r3
mova m7, [o(pw_8192)] mova m7, [o(pw_8192)]
...@@ -3471,7 +3426,7 @@ cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -3471,7 +3426,7 @@ cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(iadst_8x16_internal).pass2_pre jmp m(iadst_8x16_internal).pass2_pre
.end: .end:
ITX_8X16_LOAD_STACK_COEFS LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*0], m7
lea tx2q, [o(m(iadst_16x16_internal).end1)] lea tx2q, [o(m(iadst_16x16_internal).end1)]
mov dstq, r3 mov dstq, r3
...@@ -3516,7 +3471,7 @@ cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -3516,7 +3471,7 @@ cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass1_end: .pass1_end:
SAVE_8ROWS coeffq+16*1, 32 SAVE_8ROWS coeffq+16*1, 32
ITX_8X16_LOAD_STACK_COEFS LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*0], m7
lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end1)] lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end1)]
mova m7, [o(pw_m8192)] mova m7, [o(pw_m8192)]
...@@ -3529,7 +3484,7 @@ cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -3529,7 +3484,7 @@ cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m7, [rsp+gprsize+16*0] mova m7, [rsp+gprsize+16*0]
SAVE_8ROWS coeffq+16*0, 32 SAVE_8ROWS coeffq+16*0, 32
ITX_8X16_LOAD_STACK_COEFS LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*0], m7
lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end2)] lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end2)]
mova m7, [o(pw_m8192)] mova m7, [o(pw_m8192)]
...@@ -3549,7 +3504,7 @@ cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -3549,7 +3504,7 @@ cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(iflipadst_8x16_internal).pass2_pre jmp m(iflipadst_8x16_internal).pass2_pre
.end: .end:
ITX_8X16_LOAD_STACK_COEFS LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*0], m7
lea tx2q, [o(m(iflipadst_16x16_internal).end1)] lea tx2q, [o(m(iflipadst_16x16_internal).end1)]
lea dstq, [dstq+strideq*2] lea dstq, [dstq+strideq*2]
...@@ -3579,7 +3534,7 @@ cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -3579,7 +3534,7 @@ cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(iflipadst_8x16_internal).pass2_main jmp m(iflipadst_8x16_internal).pass2_main
.end2: .end2:
ITX_8X16_LOAD_STACK_COEFS LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*0], m7
lea tx2q, [o(m(idct_8x16_internal).end1)] lea tx2q, [o(m(idct_8x16_internal).end1)]
lea dstq, [dstq+strideq*2] lea dstq, [dstq+strideq*2]
...@@ -3661,3 +3616,686 @@ cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -3661,3 +3616,686 @@ cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
lea tx2q, [o(m(idct_8x16_internal).end1)] lea tx2q, [o(m(idct_8x16_internal).end1)]
lea dstq, [dstq+strideq*2] lea dstq, [dstq+strideq*2]
jmp .end jmp .end
; inv_txfm_add_dct_dct_8x32: entry point for the 8x32 DCT_DCT inverse
; transform + add.
; Named arguments: dst, stride, coeff, eob, tx2.
; In x86inc's cglobal convention "4, 6, 8, 16*36" means 4 arguments loaded,
; 6 GPRs used, 8 vector registers used and 16*36 bytes of stack reserved.
; eob == 0 takes the DC-only shortcut; any other value runs the full
; transform in idct_8x32_internal.
cglobal inv_txfm_add_dct_dct_8x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
; NOTE(review): r5 presumably serves as the base register for the o()
; constant addressing on 32-bit builds - confirm against the o() definition.
LEA r5, $$
%endif
test eobd, eobd ;eob == 0 -> only the DC coefficient is handled
jz .dconly
call m(idct_8x32_internal)
RET
.dconly:
; movd loads only the low 32 bits of each constant; that is sufficient
; because only word 0 of m0 survives the broadcast further down.
movd m1, [o(pw_2896x8)]
pmulhrsw m0, m1, [coeffq] ;dc * 2896/4096, rounded
movd m2, [o(pw_8192)]
mov [coeffq], eobd ;eobd is zero on this path: writes 32 zero bits at coeffq
pmulhrsw m0, m2 ;* 8192/32768, i.e. rounded >> 2
psrlw m2, 2 ;8192 >> 2 = pw_2048
pmulhrsw m0, m1 ;second multiply by 2896/4096
pmulhrsw m0, m2 ;* 2048/32768, i.e. rounded >> 4
pshuflw m0, m0, q0000 ;replicate word 0 across the low four words
punpcklwd m0, m0 ;...and across all eight words
; NOTE(review): r3d and tx2q are presumably the iteration count and the
; continuation address expected by the 8x8 .loop routine - that routine is
; not visible here, confirm before changing either value.
mov r3d, 8
lea tx2q, [o(m(inv_txfm_add_dct_dct_8x32).end)]
jmp m(inv_txfm_add_dct_dct_8x8).loop
.end:
RET
cglobal idct_8x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
%undef cmp
cmp eobd, 106
jle .fast
LOAD_8ROWS coeffq+16*3, 64
call m(idct_8x8_internal).main
mova m7, [o(pw_8192)]
lea tx2q, [o(m(idct_8x32_internal).pass1)]
jmp m(idct_8x8_internal).pass1_end1
.pass1:
mova [rsp+gprsize+16*9 ], m0 ;in24
mova [rsp+gprsize+16*10], m4 ;in28
mova [rsp+gprsize+16*17], m2 ;in26
mova [rsp+gprsize+16*18], m6 ;in30
mova [rsp+gprsize+16*31], m1 ;in25
mova [rsp+gprsize+16*30], m3 ;in27
mova [rsp+gprsize+16*27], m5 ;in29
mova [rsp+gprsize+16*34], m7 ;in31
LOAD_8ROWS coeffq+16*2, 64
call m(idct_8x8_internal).main
mova m7, [o(pw_8192)]
lea tx2q, [o(m(idct_8x32_internal).pass1_1)]
jmp m(idct_8x8_internal).pass1_end1
.pass1_1:
mova [rsp+gprsize+16*7 ], m0 ;in16
mova [rsp+gprsize+16*8 ], m4 ;in20
mova [rsp+gprsize+16*15], m2 ;in18
mova [rsp+gprsize+16*16], m6 ;in22
mova [rsp+gprsize+16*33], m1 ;in17
mova [rsp+gprsize+16*28], m3 ;in19
mova [rsp+gprsize+16*29], m5 ;in21
mova [rsp+gprsize+16*32], m7 ;in23
.fast:
LOAD_8ROWS coeffq+16*1, 64
call m(idct_8x8_internal).main
mova m7, [o(pw_8192)]
lea tx2q, [o(m(idct_8x32_internal).pass1_end)]
jmp m(idct_8x8_internal).pass1_end1
.pass1_end:
mova [rsp+gprsize+16*5 ], m0 ;in8
mova [rsp+gprsize+16*6 ], m4 ;in12
mova [rsp+gprsize+16*13], m2 ;in10
mova [rsp+gprsize+16*14], m6 ;in14
mova [rsp+gprsize+16*21], m1 ;in9
mova [rsp+gprsize+16*24], m3 ;in11
mova [rsp+gprsize+16*25], m5 ;in13
mova [rsp+gprsize+16*20], m7 ;in15
LOAD_8ROWS coeffq+16*0, 64
call m(idct_8x8_internal).main
mova m7, [o(pw_8192)]
lea tx2q, [o(m(idct_8x32_internal).pass1_end1)]
jmp m(idct_8x8_internal).pass1_end1
.pass1_end1:
mova [rsp+gprsize+16*11], m2 ;in2
mova [rsp+gprsize+16*12], m6 ;in6
mova [rsp+gprsize+16*19], m1 ;in1
mova [rsp+gprsize+16*26], m3 ;in3
mova [rsp+gprsize+16*23], m5 ;in5
mova [rsp+gprsize+16*22], m7 ;in7
mova m1, m4 ;in4
mova m2, [rsp+gprsize+16*5 ] ;in8
mova m3, [rsp+gprsize+16*6 ] ;in12
cmp eobd, 106
jg .full
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7
call m(idct_8x8_internal).main
SAVE_7ROWS rsp+gprsize+16*3 , 16
mova m0, [rsp+gprsize+16*11]
mova m1, [rsp+gprsize+16*12]
mova m2, [rsp+gprsize+16*13]
mova m3, [rsp+gprsize+16*14]
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7
call m(idct_16x8_internal).main
mova m7, [rsp+gprsize+16*0]
SAVE_8ROWS rsp+gprsize+16*11, 16
call .main_fast
jmp .pass2
.full:
mova m4, [rsp+gprsize+16*7 ] ;in16
mova m5, [rsp+gprsize+16*8 ] ;in20
mova m6, [rsp+gprsize+16*9 ] ;in24
mova m7, [rsp+gprsize+16*10] ;in28
call m(idct_8x8_internal).main
SAVE_7ROWS rsp+gprsize+16*3 , 16
LOAD_8ROWS rsp+gprsize+16*11, 16
call m(idct_16x8_internal).main
mova m7, [rsp+gprsize+16*0]
SAVE_8ROWS rsp+gprsize+16*11, 16
call .main
.pass2:
mova [rsp+gprsize+16*0 ], m7
lea tx2q, [o(m(idct_8x32_internal).end1)]
.end:
pxor m7, m7
REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \
8, 9, 10, 11, 12, 13, 14, 15, \
16, 17, 18, 19, 20, 21, 22, 23, \
24, 25, 26, 27, 28, 29, 30, 31
jmp tx2q
.end1:
lea tx2q, [o(m(idct_8x32_internal).end2)]
jmp m(idct_8x8_internal).end
.end2:
LOAD_8ROWS rsp+gprsize+16*11, 16
mova [rsp+gprsize+16*0 ], m7
lea dstq, [dstq+strideq*2]
lea tx2q, [o(m(idct_8x32_internal).end3)]
jmp m(idct_8x8_internal).end
.end3:
LOAD_8ROWS rsp+gprsize+16*19, 16
mova [rsp+gprsize+16*0 ], m7
lea dstq, [dstq+strideq*2]
lea tx2q, [o(m(idct_8x32_internal).end4)]
jmp m(idct_8x8_internal).end
.end4:
LOAD_8ROWS rsp+gprsize+16*27, 16
mova [rsp+gprsize+16*0 ], m7
lea dstq, [dstq+strideq*2]
lea tx2q, [o(m(idct_8x32_internal).end5)]
jmp m(idct_8x8_internal).end
.end5:
ret
ALIGN function_align
.main_fast: ;bottom half is zero
mova m0, [rsp+gprsize*2+16*19] ;in1
mova m1, [rsp+gprsize*2+16*20] ;in15
pmulhrsw m3, m0, [o(pw_4091x8)] ;t31a
pmulhrsw m0, [o(pw_201x8)] ;t16a
pmulhrsw m2, m1, [o(pw_3035x8)] ;t30a
pmulhrsw m1, [o(pw_m2751x8)] ;t17a
mova m7, [o(pd_2048)]
psubsw m4, m0, m1 ;t17
paddsw m0, m1 ;t16
psubsw m5, m3, m2 ;t30
paddsw m3, m2 ;t31
ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a
mova [rsp+gprsize*2+16*19], m0 ;t16
mova [rsp+gprsize*2+16*20], m5 ;t17a