Commit 9eb6ec9f, authored by Loren Merritt, committed by Fiona Glaser

Fix a buffer overread on odd input resolutions

parent 89aa4e87
...@@ -317,7 +317,7 @@ MC_COPY( 16 ) ...@@ -317,7 +317,7 @@ MC_COPY( 16 )
MC_COPY( 8 ) MC_COPY( 8 )
MC_COPY( 4 ) MC_COPY( 4 )
static void plane_copy( uint8_t *dst, int i_dst, void x264_plane_copy_c( uint8_t *dst, int i_dst,
uint8_t *src, int i_src, int w, int h) uint8_t *src, int i_src, int w, int h)
{ {
while( h-- ) while( h-- )
...@@ -483,7 +483,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf ) ...@@ -483,7 +483,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_8x8] = mc_copy_w8; pf->copy[PIXEL_8x8] = mc_copy_w8;
pf->copy[PIXEL_4x4] = mc_copy_w4; pf->copy[PIXEL_4x4] = mc_copy_w4;
pf->plane_copy = plane_copy; pf->plane_copy = x264_plane_copy_c;
pf->hpel_filter = hpel_filter; pf->hpel_filter = hpel_filter;
pf->prefetch_fenc = prefetch_fenc_null; pf->prefetch_fenc = prefetch_fenc_null;
......
...@@ -598,48 +598,44 @@ cglobal x264_sfence ...@@ -598,48 +598,44 @@ cglobal x264_sfence
ret ret
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, ; void x264_plane_copy_core_mmxext( uint8_t *dst, int i_dst,
; uint8_t *src, int i_src, int w, int h) ; uint8_t *src, int i_src, int w, int h)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
cglobal x264_plane_copy_mmxext, 6,7 ; assumes i_dst and w are multiples of 16, and i_dst>w
cglobal x264_plane_copy_core_mmxext, 6,7
movsxdifnidn r1, r1d movsxdifnidn r1, r1d
movsxdifnidn r3, r3d movsxdifnidn r3, r3d
add r4d, 3 movsxdifnidn r4, r4d
and r4d, ~3 sub r1, r4
mov r6d, r4d sub r3, r4
and r6d, ~15
sub r1, r6
sub r3, r6
.loopy: .loopy:
mov r6d, r4d mov r6d, r4d
sub r6d, 64 sub r6d, 63
jl .endx
.loopx: .loopx:
prefetchnta [r2+256] prefetchnta [r2+256]
movq mm0, [r2 ] movq mm0, [r2 ]
movq mm1, [r2+ 8] movq mm1, [r2+ 8]
movq mm2, [r2+16]
movq mm3, [r2+24]
movq mm4, [r2+32]
movq mm5, [r2+40]
movq mm6, [r2+48]
movq mm7, [r2+56]
movntq [r0 ], mm0 movntq [r0 ], mm0
movntq [r0+ 8], mm1 movntq [r0+ 8], mm1
movq mm2, [r2+16]
movq mm3, [r2+24]
movntq [r0+16], mm2 movntq [r0+16], mm2
movntq [r0+24], mm3 movntq [r0+24], mm3
movq mm4, [r2+32]
movq mm5, [r2+40]
movntq [r0+32], mm4 movntq [r0+32], mm4
movntq [r0+40], mm5 movntq [r0+40], mm5
movq mm6, [r2+48]
movq mm7, [r2+56]
movntq [r0+48], mm6 movntq [r0+48], mm6
movntq [r0+56], mm7 movntq [r0+56], mm7
add r2, 64 add r2, 64
add r0, 64 add r0, 64
sub r6d, 64 sub r6d, 64
jge .loopx jg .loopx
.endx:
prefetchnta [r2+256] prefetchnta [r2+256]
add r6d, 48 add r6d, 63
jl .end16 jle .end16
.loop16: .loop16:
movq mm0, [r2 ] movq mm0, [r2 ]
movq mm1, [r2+8] movq mm1, [r2+8]
...@@ -648,20 +644,12 @@ cglobal x264_plane_copy_mmxext, 6,7 ...@@ -648,20 +644,12 @@ cglobal x264_plane_copy_mmxext, 6,7
add r2, 16 add r2, 16
add r0, 16 add r0, 16
sub r6d, 16 sub r6d, 16
jge .loop16 jg .loop16
.end16: .end16:
add r6d, 12
jl .end4
.loop4:
movd mm2, [r2+r6]
movd [r0+r6], mm2
sub r6d, 4
jge .loop4
.end4:
add r2, r3
add r0, r1 add r0, r1
add r2, r3
dec r5d dec r5d
jg .loopy jg .loopy
sfence sfence
emms emms
RET RET
......
...@@ -88,7 +88,8 @@ extern void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride, ...@@ -88,7 +88,8 @@ extern void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride,
extern void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride, extern void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride,
uint8_t *dst, int i_dst_stride, uint8_t *dst, int i_dst_stride,
int dx, int dy, int i_width, int i_height ); int dx, int dy, int i_width, int i_height );
extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h); extern void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
extern void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h);
extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n ); extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n ); extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
extern void x264_memzero_aligned_mmx( void * dst, int n ); extern void x264_memzero_aligned_mmx( void * dst, int n );
...@@ -339,10 +340,23 @@ void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_ ...@@ -339,10 +340,23 @@ void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_
#else #else
HPEL(16, sse2, sse2, sse2, sse2) HPEL(16, sse2, sse2, sse2, sse2)
HPEL(16, ssse3, ssse3, ssse3, ssse3) HPEL(16, ssse3, ssse3, ssse3, ssse3)
#endif #endif
HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2) HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
/* Copy a w x h plane of bytes from src (row stride i_src) to dst (row stride
 * i_dst), using the non-temporal MMXEXT core for all rows except the one that
 * lies last in memory.
 *
 * The core is handed a width rounded up to a multiple of 16, so on every row
 * it may read up to 15 bytes past w. For all rows but the last one in memory
 * order those extra bytes are still inside the source buffer (the core itself
 * assumes i_dst > w); the last row is therefore copied with plain memcpy and
 * the exact width.
 *
 * The core's row loop decrements and tests its counter only after copying a
 * row, so it must never be called with a row count below 1: it would still
 * copy one padded row. The h > 1 checks below guarantee that. */
static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h)
{
    if( w < 256 ) { // tiny resolutions don't want non-temporal hints. dunno the exact threshold.
        x264_plane_copy_c( dst, i_dst, src, i_src, w, h );
        return;
    }
    if( h <= 0 ) // nothing to copy; also keeps the (h-1) row offsets below non-negative
        return;
    if( i_src > 0 ) {
        // rows ascend in memory: the final row is the last one, copy it exactly
        if( h > 1 )
            x264_plane_copy_core_mmxext( dst, i_dst, src, i_src, (w+15)&~15, h-1 );
        memcpy( dst+i_dst*(h-1), src+i_src*(h-1), w );
    } else {
        // non-positive source stride: the first row is the last one in memory order
        memcpy( dst, src, w );
        if( h > 1 )
            x264_plane_copy_core_mmxext( dst+i_dst, i_dst, src+i_src, i_src, (w+15)&~15, h-1 );
    }
}
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{ {
if( !(cpu&X264_CPU_MMX) ) if( !(cpu&X264_CPU_MMX) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment