...
 
Commits (8)
......@@ -38,15 +38,16 @@
#define ATTR_ALIAS
#endif
#if ARCH_X86
#if ARCH_X86_64
/* x86-64 needs 32-byte alignment for AVX2. */
#define ALIGN_32_VAL 32
#define ALIGN_16_VAL 16
#elif ARCH_ARM || ARCH_AARCH64
// ARM doesn't benefit from anything more than 16 byte alignment.
#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64
/* 32-bit x86 and ARM don't benefit from anything more than 16-byte alignment. */
#define ALIGN_32_VAL 16
#define ALIGN_16_VAL 16
#else
// No need for extra alignment on platforms without assembly.
/* No need for extra alignment on platforms without assembly. */
#define ALIGN_32_VAL 8
#define ALIGN_16_VAL 8
#endif
......
......@@ -147,6 +147,9 @@ if (host_machine.cpu_family() == 'aarch64' or
# Probe for getauxval() in <sys/auxv.h>. When found, HAVE_GETAUXVAL lets the
# ARM CPU detection code read AT_HWCAP through it.
if cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args)
cdata.set('HAVE_GETAUXVAL', 1)
endif
# Probe for elf_aux_info() in the same header. HAVE_ELF_AUX_INFO selects it as
# the AT_HWCAP source on ARM when HAVE_GETAUXVAL is not defined.
if cc.has_function('elf_aux_info', prefix : '#include <sys/auxv.h>', args : test_args)
cdata.set('HAVE_ELF_AUX_INFO', 1)
endif
endif
# Compiler flag tests
......@@ -167,6 +170,12 @@ optional_arguments = [
'-Wno-unused-parameter',
'-Werror=missing-prototypes',
]
# Extra warning suppressions that are appended to optional_arguments only
# when the compiler identifies itself as MSVC.
if cc.get_id() == 'msvc'
optional_arguments += [
'-wd4028', # parameter different from declaration
'-wd4996' # use of POSIX functions
]
endif
if (get_option('buildtype') != 'debug' and get_option('buildtype') != 'plain')
optional_arguments += '-fomit-frame-pointer'
......@@ -191,30 +200,35 @@ stackalign_flag = []
stackrealign_flag = []
if host_machine.cpu_family().startswith('x86')
if cc.has_argument('-mpreferred-stack-boundary=5')
stackalign_flag = ['-mpreferred-stack-boundary=5']
stackrealign_flag = ['-mincoming-stack-boundary=4']
cdata_asm.set('STACK_ALIGNMENT', 32)
cdata.set('STACK_ALIGNMENT', 32)
elif cc.has_argument('-mpreferred-stack-boundary=4')
stackalign_flag = ['-mpreferred-stack-boundary=4']
stackrealign_flag = ['-mincoming-stack-boundary=4']
cdata_asm.set('STACK_ALIGNMENT', 16)
cdata.set('STACK_ALIGNMENT', 16)
elif cc.has_argument('-mstack-alignment=32')
stackalign_flag = ['-mstack-alignment=32']
stackrealign_flag = ['-mstackrealign']
cdata_asm.set('STACK_ALIGNMENT', 32)
cdata.set('STACK_ALIGNMENT', 32)
if host_machine.cpu_family() == 'x86_64'
if cc.has_argument('-mpreferred-stack-boundary=5')
stackalign_flag = ['-mpreferred-stack-boundary=5']
stackrealign_flag = ['-mincoming-stack-boundary=4']
stack_alignment = 32
elif cc.has_argument('-mstack-alignment=32')
stackalign_flag = ['-mstack-alignment=32']
stackrealign_flag = ['-mstackrealign']
stack_alignment = 32
else
stack_alignment = 16
endif
else
if host_machine.cpu_family() == 'x86_64'
cdata_asm.set('STACK_ALIGNMENT', 16)
cdata.set('STACK_ALIGNMENT', 16)
if host_machine.system() == 'linux' or host_machine.system() == 'darwin'
stack_alignment = 16
elif cc.has_argument('-mpreferred-stack-boundary=4')
stackalign_flag = ['-mpreferred-stack-boundary=4']
stackrealign_flag = ['-mincoming-stack-boundary=2']
stack_alignment = 16
elif cc.has_argument('-mstack-alignment=16')
stackalign_flag = ['-mstack-alignment=16']
stackrealign_flag = ['-mstackrealign']
stack_alignment = 16
else
cdata_asm.set('STACK_ALIGNMENT', 4)
cdata.set('STACK_ALIGNMENT', 4)
stack_alignment = 4
endif
endif
cdata_asm.set('STACK_ALIGNMENT', stack_alignment)
cdata.set('STACK_ALIGNMENT', stack_alignment)
endif
cdata.set10('ARCH_AARCH64', host_machine.cpu_family() == 'aarch64')
......
......@@ -37,6 +37,11 @@
#endif
#define NEON_HWCAP HWCAP_ARM_NEON
#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
#include <sys/auxv.h>
#define NEON_HWCAP HWCAP_NEON
#elif defined(__ANDROID__)
#include <stdio.h>
#include <string.h>
......@@ -72,9 +77,15 @@ unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = 0;
#if ARCH_AARCH64
flags |= DAV1D_ARM_CPU_FLAG_NEON;
#elif defined(__ARM_NEON)
flags |= DAV1D_ARM_CPU_FLAG_NEON;
#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
unsigned long hw_cap = getauxval(AT_HWCAP);
flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
unsigned long hw_cap = 0;
elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
#elif defined(__ANDROID__)
flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
#elif defined(__APPLE__)
......
......@@ -39,7 +39,7 @@
#include "src/data.h"
#include "src/ref.h"
uint8_t * dav1d_data_create(Dav1dData *const buf, const size_t sz) {
uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) {
validate_input_or_ret(buf != NULL, NULL);
buf->ref = dav1d_ref_create(sz);
......@@ -53,9 +53,11 @@ uint8_t * dav1d_data_create(Dav1dData *const buf, const size_t sz) {
return buf->ref->data;
}
int dav1d_data_wrap(Dav1dData *const buf, const uint8_t *const ptr, const size_t sz,
void (*free_callback)(const uint8_t *data, void *user_data),
void *user_data)
int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr,
const size_t sz,
void (*const free_callback)(const uint8_t *data,
void *user_data),
void *const user_data)
{
validate_input_or_ret(buf != NULL, -EINVAL);
validate_input_or_ret(ptr != NULL, -EINVAL);
......@@ -84,7 +86,7 @@ void dav1d_data_move_ref(Dav1dData *const dst, Dav1dData *const src) {
memset(src, 0, sizeof(*src));
}
void dav1d_data_unref(Dav1dData *const buf) {
void dav1d_data_unref_internal(Dav1dData *const buf) {
validate_input(buf != NULL);
if (buf->ref) {
......
......@@ -35,4 +35,11 @@
*/
void dav1d_data_move_ref(Dav1dData *dst, Dav1dData *src);
/*
 * Library-internal implementations behind dav1d_data_create(),
 * dav1d_data_wrap() and dav1d_data_unref(). Those functions forward their
 * arguments here unchanged; code inside the library calls these
 * *_internal variants directly.
 */
uint8_t *dav1d_data_create_internal(Dav1dData *buf, size_t sz);
int dav1d_data_wrap_internal(Dav1dData *buf, const uint8_t *ptr, size_t sz,
void (*free_callback)(const uint8_t *data,
void *user_data),
void *user_data);
void dav1d_data_unref_internal(Dav1dData *buf);
#endif /* __DAV1D_SRC_DATA_H__ */
......@@ -1264,16 +1264,17 @@ static int decode_b(Dav1dTileContext *const t,
}
int src_left = t->bx * 4 + (b->mv[0].x >> 3);
int src_top = t->by * 4 + (b->mv[0].y >> 3);
int src_right = src_left + w4 * 4;
int src_bottom = src_top + h4 * 4;
int src_right = src_left + bw4 * 4;
int src_bottom = src_top + bh4 * 4;
const int border_right = ((ts->tiling.col_end + (bw4 - 1)) & ~(bw4 - 1)) * 4;
// check against left or right tile boundary and adjust if necessary
if (src_left < border_left) {
src_right += border_left - src_left;
src_left += border_left - src_left;
} else if (src_right > ts->tiling.col_end * 4) {
src_left -= src_right - ts->tiling.col_end * 4;
src_right -= src_right - ts->tiling.col_end * 4;
} else if (src_right > border_right) {
src_left -= src_right - border_right;
src_right -= src_right - border_right;
}
// check against top tile boundary and adjust if necessary
if (src_top < border_top) {
......@@ -3001,7 +3002,7 @@ error:
dav1d_ref_dec(&f->ref_mvs_ref[i]);
}
dav1d_picture_unref(&f->cur);
dav1d_picture_unref_internal(&f->cur);
dav1d_thread_picture_unref(&f->sr_cur);
dav1d_cdf_thread_unref(&f->in_cdf);
if (f->frame_hdr->refresh_context) {
......@@ -3015,7 +3016,7 @@ error:
dav1d_ref_dec(&f->frame_hdr_ref);
for (int i = 0; i < f->n_tile_data; i++)
dav1d_data_unref(&f->tile[i].data);
dav1d_data_unref_internal(&f->tile[i].data);
return retval;
}
......@@ -3355,7 +3356,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
if (c->n_fc == 1) {
const unsigned refresh_frame_flags = f->frame_hdr->refresh_frame_flags;
if ((res = dav1d_decode_frame(f)) < 0) {
dav1d_picture_unref(&c->out);
dav1d_picture_unref_internal(&c->out);
for (int i = 0; i < 8; i++) {
if (refresh_frame_flags & (1 << i)) {
if (c->refs[i].p.p.data[0])
......@@ -3383,17 +3384,17 @@ error:
dav1d_ref_dec(&f->ref_mvs_ref[i]);
}
if (c->n_fc == 1)
dav1d_picture_unref(&c->out);
dav1d_picture_unref_internal(&c->out);
else
dav1d_thread_picture_unref(out_delayed);
dav1d_picture_unref(&f->cur);
dav1d_picture_unref_internal(&f->cur);
dav1d_thread_picture_unref(&f->sr_cur);
dav1d_ref_dec(&f->mvs_ref);
dav1d_ref_dec(&f->seq_hdr_ref);
dav1d_ref_dec(&f->frame_hdr_ref);
for (int i = 0; i < f->n_tile_data; i++)
dav1d_data_unref(&f->tile[i].data);
dav1d_data_unref_internal(&f->tile[i].data);
f->n_tile_data = 0;
if (c->n_fc > 1) {
......
......@@ -198,7 +198,7 @@ int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out,
if (res < 0) return res;
if (ptr) {
res = dav1d_data_wrap(&buf, ptr, sz, dummy_free, NULL);
res = dav1d_data_wrap_internal(&buf, ptr, sz, dummy_free, NULL);
if (res < 0) goto error;
}
......@@ -220,7 +220,7 @@ int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out,
res = 0;
error:
dav1d_data_unref(&buf);
dav1d_data_unref_internal(&buf);
dav1d_close(&c);
return res;
......@@ -257,8 +257,8 @@ static int output_image(Dav1dContext *const c, Dav1dPicture *const out,
// Apply film grain to a new copy of the image to avoid corrupting refs
int res = dav1d_picture_alloc_copy(out, in->p.w, in);
if (res < 0) {
dav1d_picture_unref(in);
dav1d_picture_unref(out);
dav1d_picture_unref_internal(in);
dav1d_picture_unref_internal(out);
return res;
}
......@@ -278,7 +278,7 @@ static int output_image(Dav1dContext *const c, Dav1dPicture *const out,
assert(0);
}
dav1d_picture_unref(in);
dav1d_picture_unref_internal(in);
return 0;
}
......@@ -290,7 +290,7 @@ static int output_picture_ready(Dav1dContext *const c) {
if (c->operating_point_idc && !c->all_layers) {
const int max_spatial_id = ulog2(c->operating_point_idc >> 8);
if (max_spatial_id > c->out.frame_hdr->spatial_id) {
dav1d_picture_unref(&c->out);
dav1d_picture_unref_internal(&c->out);
return 0;
}
}
......@@ -346,12 +346,12 @@ int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out)
while (in->sz > 0) {
res = dav1d_parse_obus(c, in, 0);
if (res < 0) {
dav1d_data_unref(in);
dav1d_data_unref_internal(in);
} else {
assert((size_t)res <= in->sz);
in->sz -= res;
in->data += res;
if (!in->sz) dav1d_data_unref(in);
if (!in->sz) dav1d_data_unref_internal(in);
}
if (output_picture_ready(c))
break;
......@@ -369,7 +369,7 @@ int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out)
}
void dav1d_flush(Dav1dContext *const c) {
dav1d_data_unref(&c->in);
dav1d_data_unref_internal(&c->in);
c->drain = 0;
if (c->n_fc == 1) return;
......@@ -482,7 +482,7 @@ void dav1d_close(Dav1dContext **const c_out) {
dav1d_free_aligned(f->lf.lr_lpf_line);
}
dav1d_free_aligned(c->fc);
dav1d_data_unref(&c->in);
dav1d_data_unref_internal(&c->in);
if (c->n_fc > 1) {
for (unsigned n = 0; n < c->n_fc; n++)
if (c->frame_thread.out_delayed[n].p.data[0])
......@@ -490,7 +490,7 @@ void dav1d_close(Dav1dContext **const c_out) {
free(c->frame_thread.out_delayed);
}
for (int n = 0; n < c->n_tile_data; n++)
dav1d_data_unref(&c->tile[n].data);
dav1d_data_unref_internal(&c->tile[n].data);
for (int n = 0; n < 8; n++) {
dav1d_cdf_thread_unref(&c->cdf[n]);
if (c->refs[n].p.p.data[0])
......@@ -503,3 +503,24 @@ void dav1d_close(Dav1dContext **const c_out) {
dav1d_freep_aligned(c_out);
}
/*
 * Thin wrapper that forwards p unchanged to dav1d_picture_unref_internal().
 * NOTE(review): presumably this is the externally visible symbol, while
 * library code calls the *_internal variant directly - confirm.
 */
void dav1d_picture_unref(Dav1dPicture *const p) {
dav1d_picture_unref_internal(p);
}
/*
 * Thin wrapper around dav1d_data_create_internal().
 *
 * buf and sz are passed through unchanged, and the pointer produced by the
 * internal function is handed back to the caller as-is.
 */
uint8_t *dav1d_data_create(Dav1dData *const buf, const size_t sz) {
    uint8_t *const payload = dav1d_data_create_internal(buf, sz);

    return payload;
}
/*
 * Thin wrapper around dav1d_data_wrap_internal().
 *
 * All five arguments (buf, ptr, sz, free_callback, user_data) are forwarded
 * unchanged, and the internal function's integer result is returned as-is.
 */
int dav1d_data_wrap(Dav1dData *const buf, const uint8_t *const ptr,
                    const size_t sz,
                    void (*const free_callback)(const uint8_t *data,
                                                void *user_data),
                    void *const user_data)
{
    const int status = dav1d_data_wrap_internal(buf, ptr, sz,
                                                free_callback, user_data);

    return status;
}
/*
 * Thin wrapper that forwards buf unchanged to dav1d_data_unref_internal().
 * NOTE(review): presumably this is the externally visible symbol, while
 * library code calls the *_internal variant directly - confirm.
 */
void dav1d_data_unref(Dav1dData *const buf) {
dav1d_data_unref_internal(buf);
}
......@@ -47,7 +47,8 @@ static void backup_lpf(const Dav1dFrameContext *const f,
pixel *dst, const ptrdiff_t dst_stride,
const pixel *src, const ptrdiff_t src_stride,
const int ss_ver, const int sb128,
int row, const int row_h, const int src_w, const int ss_hor)
int row, const int row_h, const int src_w,
const int h, const int ss_hor)
{
const int dst_w = f->frame_hdr->super_res.enabled ?
(f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w;
......@@ -74,18 +75,25 @@ static void backup_lpf(const Dav1dFrameContext *const f,
if (f->frame_hdr->super_res.enabled) {
while (row + stripe_h <= row_h) {
const int n_lines = 4 - (row + stripe_h + 1 == h);
f->dsp->mc.resize(dst, dst_stride, src, src_stride,
dst_w, src_w, 4, f->resize_step[ss_hor],
dst_w, src_w, n_lines, f->resize_step[ss_hor],
f->resize_start[ss_hor] HIGHBD_CALL_SUFFIX);
row += stripe_h; // unmodified stripe_h for the 1st stripe
stripe_h = 64 >> ss_ver;
src += stripe_h * PXSTRIDE(src_stride);
dst += 4 * PXSTRIDE(dst_stride);
dst += n_lines * PXSTRIDE(dst_stride);
if (n_lines == 3) {
pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], dst_w);
dst += PXSTRIDE(dst_stride);
}
}
} else {
while (row + stripe_h <= row_h) {
const int n_lines = 4 - (row + stripe_h + 1 == h);
for (int i = 0; i < 4; i++) {
pixel_copy(dst, src, src_w);
pixel_copy(dst, i == n_lines ? &dst[-PXSTRIDE(dst_stride)] :
src, src_w);
dst += PXSTRIDE(dst_stride);
src += PXSTRIDE(src_stride);
}
......@@ -110,20 +118,20 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
if (restore_planes & LR_RESTORE_Y) {
const int h = f->bh << 2;
const int h = f->cur.p.h;
const int w = f->bw << 2;
const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 4);
const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1);
const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset;
backup_lpf(f, f->lf.lr_lpf_line_ptr[0], lr_stride,
src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
0, f->seq_hdr->sb128, y_stripe, row_h, w, 0);
0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0);
}
if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int h = f->bh << (2 - ss_ver);
const int h = (f->cur.p.h + ss_ver) >> ss_ver;
const int w = f->bw << (2 - ss_hor);
const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 4);
const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1);
const ptrdiff_t offset_uv = offset >> ss_ver;
const int y_stripe =
(sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
......@@ -131,12 +139,12 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
if (restore_planes & LR_RESTORE_U) {
backup_lpf(f, f->lf.lr_lpf_line_ptr[1], lr_stride,
src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, ss_hor);
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
}
if (restore_planes & LR_RESTORE_V) {
backup_lpf(f, f->lf.lr_lpf_line_ptr[2], lr_stride,
src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, ss_hor);
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
}
}
}
......
......@@ -1283,7 +1283,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
return res;
}
for (int n = 0; n < c->n_tile_data; n++)
dav1d_data_unref(&c->tile[n].data);
dav1d_data_unref_internal(&c->tile[n].data);
c->n_tile_data = 0;
c->n_tiles = 0;
if (type != OBU_FRAME) {
......@@ -1333,7 +1333,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
c->tile[c->n_tile_data].start != c->n_tiles)
{
for (int i = 0; i <= c->n_tile_data; i++)
dav1d_data_unref(&c->tile[i].data);
dav1d_data_unref_internal(&c->tile[i].data);
c->n_tile_data = 0;
c->n_tiles = 0;
goto error;
......
......@@ -224,7 +224,7 @@ void dav1d_thread_picture_ref(Dav1dThreadPicture *dst,
dst->progress = src->progress;
}
void dav1d_picture_unref(Dav1dPicture *const p) {
void dav1d_picture_unref_internal(Dav1dPicture *const p) {
validate_input(p != NULL);
if (p->ref) {
......@@ -237,7 +237,7 @@ void dav1d_picture_unref(Dav1dPicture *const p) {
}
void dav1d_thread_picture_unref(Dav1dThreadPicture *const p) {
dav1d_picture_unref(&p->p);
dav1d_picture_unref_internal(&p->p);
p->t = NULL;
p->progress = NULL;
......
......@@ -109,5 +109,6 @@ void dav1d_thread_picture_signal(const Dav1dThreadPicture *p, int y,
int default_picture_allocator(Dav1dPicture *, void *cookie);
void default_picture_release(Dav1dPicture *, void *cookie);
/*
 * Library-internal implementation behind dav1d_picture_unref(), which
 * forwards its argument here unchanged.
 */
void dav1d_picture_unref_internal(Dav1dPicture *p);
#endif /* __DAV1D_SRC_PICTURE_H__ */
......@@ -205,8 +205,9 @@ static int decode_coefs(Dav1dTileContext *const t,
// residual and sign
int dc_sign = 1;
const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
const uint8_t *const qm_tbl = f->qm[is_1d || *txtp == IDTX][tx][plane];
const uint8_t *const qm_tbl = f->qm[lossless || is_1d || *txtp == IDTX][tx][plane];
const int dq_shift = imax(0, t_dim->ctx - 2);
const int bitdepth = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
const int cf_min = -(1 << (7 + bitdepth));
......
......@@ -25,6 +25,8 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include <stdint.h>
#include "src/x86/cpu.h"
......@@ -47,6 +49,8 @@ unsigned dav1d_get_cpu_flags_x86(void) {
if (info[2] & (1 << 9)) flags |= DAV1D_X86_CPU_FLAG_SSSE3;
if (info[2] & (1 << 19)) flags |= DAV1D_X86_CPU_FLAG_SSE41;
if (info[2] & (1 << 20)) flags |= DAV1D_X86_CPU_FLAG_SSE42;
#if ARCH_X86_64
/* We only support >128-bit SIMD on x86-64. */
if (info[2] & (1 << 27)) /* OSXSAVE */ {
uint64_t xcr = dav1d_cpu_xgetbv(0);
if ((xcr & 0x00000006) == 0x00000006) /* XMM/YMM */ {
......@@ -61,6 +65,7 @@ unsigned dav1d_get_cpu_flags_x86(void) {
}
}
}
#endif
}
return flags;
......