Commit a6cee0ab authored by Fiona Glaser's avatar Fiona Glaser

Align lowres planes for improved cacheline split performance

parent 579e930f
......@@ -23,6 +23,8 @@
#include "common.h"
#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
x264_frame_t *x264_frame_new( x264_t *h )
{
x264_frame_t *frame = x264_malloc( sizeof(x264_frame_t) );
......@@ -32,22 +34,16 @@ x264_frame_t *x264_frame_new( x264_t *h )
int i_stride, i_width, i_lines;
int i_padv = PADV << h->param.b_interlaced;
int luma_plane_size;
int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
if( !frame ) return NULL;
memset( frame, 0, sizeof(x264_frame_t) );
/* allocate frame data (+64 for extra data for me) */
i_width = ( ( h->param.i_width + 15 ) & -16 );
i_stride = i_width + 2*PADH;
i_lines = ( ( h->param.i_height + 15 ) & -16 );
if( h->param.b_interlaced )
i_lines = ( i_lines + 31 ) & -32;
if( h->param.cpu&X264_CPU_CACHELINE_64 )
i_stride = (i_stride + 63) & ~63;
else if( h->param.cpu&X264_CPU_CACHELINE_32 )
i_stride = (i_stride + 31) & ~31;
i_width = ALIGN( h->param.i_width, 16 );
i_stride = ALIGN( i_width + 2*PADH, align );
i_lines = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
frame->i_plane = 3;
for( i = 0; i < 3; i++ )
......@@ -61,27 +57,26 @@ x264_frame_t *x264_frame_new( x264_t *h )
for( i = 1; i < 3; i++ )
{
CHECKED_MALLOC( frame->buffer[i], luma_plane_size/4 );
frame->plane[i] = (uint8_t*)frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
}
/* all 4 luma planes allocated together, since the cacheline split code
* requires them to be in-phase wrt cacheline alignment. */
CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
for( i = 0; i < 4; i++ )
frame->filtered[i] = (uint8_t*)frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
frame->plane[0] = frame->filtered[0];
if( h->frames.b_have_lowres )
{
frame->i_width_lowres = frame->i_width[0]/2;
frame->i_stride_lowres = (frame->i_width_lowres + 2*PADH + 15) & ~15;
frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
frame->i_lines_lowres = frame->i_lines[0]/2;
luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv );
CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
for( i = 0; i < 4; i++ )
{
CHECKED_MALLOC( frame->buffer_lowres[i],
frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv ) );
frame->lowres[i] = ((uint8_t*)frame->buffer_lowres[i]) +
frame->i_stride_lowres * i_padv + PADH;
}
frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
}
if( h->param.analyse.i_me_method >= X264_ME_ESA )
......
......@@ -56,8 +56,8 @@ typedef struct
/* for unrestricted mv we allocate more data than needed
* allocated data are stored in buffer */
void *buffer[4];
void *buffer_lowres[4];
uint8_t *buffer[4];
uint8_t *buffer_lowres[4];
/* motion data */
int8_t *mb_type;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment