Commit 11da4086 authored by Henrik Gramner, committed by Henrik Gramner

Fix buffer overflow in 64x16 ssse3 idct

With frame threading enabled, the code could previously clobber the
coefficients of the next block.

Update the checkasm test to check for this.
parent 94e30ef9
Pipeline #6826 passed with stages in 6 minutes and 41 seconds
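The idea behind the checkasm change further down is to make out-of-bounds coefficient writes visible: copy_subcoefs() now fills the part of the 32*32 coefficient buffer beyond the coded sw*sh area with random values, and the test compares the reference and assembly copies over the whole buffer instead of only the coded region, so any write past the packed coefficients shows up as a mismatch. A minimal stand-alone sketch of that strategy (hypothetical helper names; the real test uses rnd(), call_ref() and call_new()):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

typedef int16_t coef;

/* Sketch only, not dav1d's test code: poison the coefficients past the
 * coded sw*sh area, run both implementations on identical copies, and
 * compare the whole 32*32 buffer so writes past the coded area are caught. */
static int itx_overflow_check(int sw, int sh, int eob,
                              void (*ref_itx)(coef *),
                              void (*asm_itx)(coef *))
{
    coef c_ref[32 * 32], c_asm[32 * 32];
    int n;

    for (n = 0; n < sw * sh; n++)       /* coded coefficient area */
        c_ref[n] = (coef)(n <= eob ? rand() : 0);
    for (; n < 32 * 32; n++)            /* area the idct must never touch */
        c_ref[n] = (coef)rand();
    memcpy(c_asm, c_ref, sizeof(c_ref));

    ref_itx(c_ref);
    asm_itx(c_asm);

    /* Both implementations clear the coefficients they consume; comparing
     * the full buffer also verifies nothing past sw*sh was overwritten. */
    return memcmp(c_ref, c_asm, sizeof(c_ref)) ? -1 : 0;
}

static void noop_itx(coef *c) { (void)c; }  /* placeholder implementation */

int main(void)
{
    return itx_overflow_check(32, 16, 5, noop_itx, noop_itx);
}

With the assembly fix below, the 64x16 idct keeps its extra pass-1 output on the stack and leaves the poisoned area untouched; the previous version would trip this comparison.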
@@ -6097,7 +6097,7 @@ ALIGN function_align
-cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
+cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
 %if ARCH_X86_32
     LEA r5, $$
 %endif
@@ -6186,7 +6186,9 @@ cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
 %endmacro
 cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
-    mov r3, 2
+    mov r3d, 2
+    mov [rsp+gprsize*2+16*67], dstq
+    lea dstq, [rsp+gprsize+16*68]
 .pass1_loop:
     LOAD_4ROWS coeffq+32*0, 32*8
@@ -6277,7 +6279,7 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp m(idct_8x8_internal).pass1_end1
 .pass1_end4:
-    SAVE_8ROWS coeffq+32*32, 32
+    SAVE_8ROWS dstq+32*0, 32
     LOAD_8ROWS rsp+gprsize+16*43, 16
     mova [rsp+gprsize+16*0], m7
     mova m7, [o(pw_8192)]
@@ -6285,7 +6287,7 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp m(idct_8x8_internal).pass1_end1
 .pass1_end5:
-    SAVE_8ROWS coeffq+32*40, 32
+    SAVE_8ROWS dstq+32*8, 32
     LOAD_8ROWS rsp+gprsize+16*51, 16
     mova [rsp+gprsize+16*0], m7
     mova m7, [o(pw_8192)]
@@ -6293,7 +6295,7 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp m(idct_8x8_internal).pass1_end1
 .pass1_end6:
-    SAVE_8ROWS coeffq+32*48, 32
+    SAVE_8ROWS dstq+32*16, 32
     LOAD_8ROWS rsp+gprsize+16*59, 16
     mova [rsp+gprsize+16*0], m7
     mova m7, [o(pw_8192)]
@@ -6301,20 +6303,20 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp m(idct_8x8_internal).pass1_end1
 .pass1_end7:
-    SAVE_8ROWS coeffq+32*56, 32
+    SAVE_8ROWS dstq+32*24, 32
     add coeffq, 16
-    dec r3
+    add dstq, 16
+    dec r3d
     jg .pass1_loop
 .pass2:
+    mov dstq, [rsp+gprsize*2+16*67]
     sub coeffq, 32
-    mov r3, 8
-    lea r4, [dstq+8]
-    mov [rsp+gprsize*2+16*67], r4
+    mov r3d, 4
 .pass2_loop:
-    mov [rsp+gprsize*1+16*67], r3
+    mov [rsp+gprsize*1+16*67], r3d
     LOAD_4ROWS coeffq+16*0, 32*2
     LOAD_4ROWS_H coeffq+16*1, 32*2
@@ -6341,13 +6343,47 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
     add coeffq, 16*16
-    mov r3, [rsp+gprsize*1+16*67]
+    mov r3d, [rsp+gprsize*1+16*67]
     mov dstq, [rsp+gprsize*2+16*67]
-    lea r4, [dstq+8]
-    mov [rsp+gprsize*2+16*67], r4
-    dec r3
+    add dstq, 8
+    mov [rsp+gprsize*2+16*67], dstq
+    dec r3d
     jg .pass2_loop
+    mov r3d, 4
+    lea coeffq, [rsp+gprsize+16*68]
+.pass2_loop2:
+    mov [rsp+gprsize*1+16*67], r3d
+    LOAD_4ROWS coeffq+16*0, 32*2
+    LOAD_4ROWS_H coeffq+16*1, 32*2
+    call m(idct_8x8_internal).main
+    SAVE_7ROWS rsp+gprsize+16*3, 16
+    LOAD_4ROWS coeffq+16*2, 32*2
+    LOAD_4ROWS_H coeffq+16*3, 32*2
+    call m(idct_16x8_internal).main
+    mov r3, dstq
+    lea tx2q, [o(m(idct_64x16_internal).end2)]
+    lea dstq, [dstq+strideq*8]
+    jmp m(idct_8x8_internal).end
+.end2:
+    LOAD_8ROWS rsp+gprsize+16*3, 16
+    mova [rsp+gprsize+16*0], m7
+    lea tx2q, [o(m(idct_64x16_internal).end3)]
+    mov dstq, r3
+    jmp m(idct_8x8_internal).end
+.end3:
+    add coeffq, 16*16
+    mov r3d, [rsp+gprsize*1+16*67]
+    mov dstq, [rsp+gprsize*2+16*67]
+    add dstq, 8
+    mov [rsp+gprsize*2+16*67], dstq
+    dec r3d
+    jg .pass2_loop2
     ret
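For context (this paragraph and sketch are not part of the commit): pass 1 of the 64x16 transform produces twice as many intermediate values as the 32x16 packed coefficients it reads, and the old code wrote the surplus half to coeffq+32*32 and beyond, i.e. past the end of the packed coefficient buffer, which is what clobbered the next block's coefficients under frame threading. The patch enlarges the stack frame from 16*68 to 16*132 bytes (16*64 extra bytes, exactly 32x16 int16 values), points dstq at that scratch area during pass 1, and adds a second pass-2 loop (.pass2_loop2) that reads those rows back from the stack. A rough C sketch of the pass-1 data flow, with a stub standing in for the real 1-D transform:

#include <stdint.h>
#include <string.h>

typedef int16_t coef;

/* Stand-in for the real 1-D 64-point inverse transform: it only models the
 * data-flow shape (32 input coefficients per row expand to 64 intermediate
 * values), not the actual DCT math. */
static void row_idct64_stub(const coef *in, coef out[64])
{
    for (int i = 0; i < 64; i++)
        out[i] = in[i % 32];
}

/* Pass 1 of a 64x16 inverse transform: its output is twice as large as the
 * packed 32x16 coefficient buffer, so the surplus half has to go into
 * scratch memory (the enlarged stack frame in the asm) instead of past the
 * end of coeff[], where it would hit the next block's coefficients. */
static void pass1_64x16_sketch(coef coeff[32 * 16], coef scratch[32 * 16])
{
    for (int y = 0; y < 16; y++) {
        coef row[64];
        row_idct64_stub(&coeff[y * 32], row);
        memcpy(&coeff[y * 32],   &row[0],  32 * sizeof(coef)); /* reuses coeff[] */
        memcpy(&scratch[y * 32], &row[32], 32 * sizeof(coef)); /* stack scratch  */
    }
}

int main(void)
{
    coef coeff[32 * 16] = {0}, scratch[32 * 16];
    pass1_64x16_sketch(coeff, scratch);   /* pass 2 would then read both halves */
    return 0;
}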
@@ -158,6 +158,8 @@ static int copy_subcoefs(coef *coeff,
         eob += rnd() % (n - eob - 1);
     for (n = eob + 1; n < sw * sh; n++)
         coeff[scan[n]] = 0;
+    for (; n < 32 * 32; n++)
+        coeff[n] = rnd();
     return eob;
 }
@@ -224,7 +226,7 @@ void bitfn(checkasm_check_itx)(void) {
     Dav1dInvTxfmDSPContext c;
     bitfn(dav1d_itx_dsp_init)(&c);
-    ALIGN_STK_32(coef, coeff, 3, [32 * 32]);
+    ALIGN_STK_32(coef, coeff, 2, [32 * 32]);
     ALIGN_STK_32(pixel, c_dst, 64 * 64,);
     ALIGN_STK_32(pixel, a_dst, 64 * 64,);
@@ -245,7 +247,6 @@ void bitfn(checkasm_check_itx)(void) {
         const enum RectTxfmSize tx = txfm_size_order[i];
         const int w = dav1d_txfm_dimensions[tx].w * 4;
         const int h = dav1d_txfm_dimensions[tx].h * 4;
-        const int sw = imin(w, 32), sh = imin(h, 32);
         const int subsh_max = subsh_iters[imax(dav1d_txfm_dimensions[tx].lw,
                                                dav1d_txfm_dimensions[tx].lh)];
@@ -263,24 +264,22 @@ void bitfn(checkasm_check_itx)(void) {
                 const int bitdepth_max = 0xff;
 #endif
                 const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max);
+                memcpy(coeff[1], coeff[0], sizeof(*coeff));
                 for (int j = 0; j < w * h; j++)
                     c_dst[j] = a_dst[j] = rnd() & bitdepth_max;
-                memcpy(coeff[1], coeff[0], sw * sh * sizeof(**coeff));
-                memcpy(coeff[2], coeff[0], sw * sh * sizeof(**coeff));
                 call_ref(c_dst, w * sizeof(*c_dst), coeff[0], eob
                          HIGHBD_TAIL_SUFFIX);
                 call_new(a_dst, w * sizeof(*c_dst), coeff[1], eob
                          HIGHBD_TAIL_SUFFIX);
                 if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)) ||
-                    memcmp(coeff[0], coeff[1], sw * sh * sizeof(**coeff)))
+                    memcmp(coeff[0], coeff[1], sizeof(*coeff)))
                 {
                     fail();
                 }
-                bench_new(a_dst, w * sizeof(*c_dst), coeff[2], eob
+                bench_new(a_dst, w * sizeof(*c_dst), coeff[0], eob
                          HIGHBD_TAIL_SUFFIX);
             }
         report("add_%dx%d", w, h);