...
 
Commits (9)
......@@ -88,6 +88,40 @@ build-win64:
- build/dav1d_install/
expire_in: 1 week
# Cross-compiles a release build for 32-bit ARM Windows using the
# armv7-w64-mingw32 meson cross file. This job only builds: it has no
# install step and publishes no artifacts.
build-win-arm32:
    image: registry.videolan.org:5000/vlc-debian-llvm-mingw:20190117110230
    stage: build
    tags:
        - win32
    script:
        - meson build --buildtype release
                      --werror
                      --libdir lib
                      --prefix "$(pwd)/build/dav1d_install"
                      --cross-file /opt/crossfiles/armv7-w64-mingw32.meson
                      -Ddefault_library=both
        - ninja -C build
# Cross-compiles a release build for 64-bit ARM Windows using the
# aarch64-w64-mingw32 meson cross file, installs it into
# build/dav1d_install and keeps that directory as a job artifact
# for one week.
build-win-arm64:
    image: registry.videolan.org:5000/vlc-debian-llvm-mingw:20190117110230
    stage: build
    tags:
        - win64
    script:
        - meson build --buildtype release
                      --werror
                      --libdir lib
                      --prefix "$(pwd)/build/dav1d_install"
                      --cross-file /opt/crossfiles/aarch64-w64-mingw32.meson
                      -Ddefault_library=both
        - ninja -C build
        - ninja -C build install
    artifacts:
        name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
        paths:
            - build/dav1d_install/
        expire_in: 1 week
build-debian-aarch64:
stage: build
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
......
......@@ -546,58 +546,61 @@ endfunc
mla \d\wd, \s2\wd, v0.h[2]
mla \d\wd, \s3\wd, v0.h[3]
.endm
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
//
// mul_mla_8_1 d0, d1, s0..s8
//   Multiply-accumulates eight consecutive source registers against the
//   eight 16-bit lanes of v0, for two outputs offset by one source:
//     \d0.8h = s0*v0.h[0] + s1*v0.h[1] + ... + s7*v0.h[7]
//     \d1.8h = s1*v0.h[0] + s2*v0.h[1] + ... + s8*v0.h[7]
//   Each destination is produced by a single uninterrupted chain; \d1 is
//   not written until the \d0 chain has finished.
.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
        // First output: sources s0..s7.
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        // Second output: sources s1..s8.
        mul             \d1\().8h, \s1\().8h, v0.h[0]
        mla             \d1\().8h, \s2\().8h, v0.h[1]
        mla             \d1\().8h, \s3\().8h, v0.h[2]
        mla             \d1\().8h, \s4\().8h, v0.h[3]
        mla             \d1\().8h, \s5\().8h, v0.h[4]
        mla             \d1\().8h, \s6\().8h, v0.h[5]
        mla             \d1\().8h, \s7\().8h, v0.h[6]
        mla             \d1\().8h, \s8\().8h, v0.h[7]
.endm
// mul_mla_8_2 d0, d1, s0..s9
//   Multiply-accumulates eight consecutive source registers against the
//   eight 16-bit lanes of v0, for two outputs offset by two sources:
//     \d0.8h = s0*v0.h[0] + s1*v0.h[1] + ... + s7*v0.h[7]
//     \d1.8h = s2*v0.h[0] + s3*v0.h[1] + ... + s9*v0.h[7]
//   Each destination is computed as one uninterrupted mul/mla chain
//   rather than alternating between the two; \d1 is not written until
//   the \d0 chain has finished.
.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
        // First output: sources s0..s7.
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        // Second output: sources s2..s9.
        mul             \d1\().8h, \s2\().8h, v0.h[0]
        mla             \d1\().8h, \s3\().8h, v0.h[1]
        mla             \d1\().8h, \s4\().8h, v0.h[2]
        mla             \d1\().8h, \s5\().8h, v0.h[3]
        mla             \d1\().8h, \s6\().8h, v0.h[4]
        mla             \d1\().8h, \s7\().8h, v0.h[5]
        mla             \d1\().8h, \s8\().8h, v0.h[6]
        mla             \d1\().8h, \s9\().8h, v0.h[7]
.endm
// mul_mla_8_4 d0, d1, s0..s11
//   Multiply-accumulates eight consecutive source registers against the
//   eight 16-bit lanes of v0, for two outputs offset by four sources:
//     \d0.8h = s0*v0.h[0] + s1*v0.h[1] + ... + s7*v0.h[7]
//     \d1.8h = s4*v0.h[0] + s5*v0.h[1] + ... + s11*v0.h[7]
//   Each destination is computed as one uninterrupted mul/mla chain
//   rather than alternating between the two; \d1 is not written until
//   the \d0 chain has finished.
.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
        // First output: sources s0..s7.
        mul             \d0\().8h, \s0\().8h,  v0.h[0]
        mla             \d0\().8h, \s1\().8h,  v0.h[1]
        mla             \d0\().8h, \s2\().8h,  v0.h[2]
        mla             \d0\().8h, \s3\().8h,  v0.h[3]
        mla             \d0\().8h, \s4\().8h,  v0.h[4]
        mla             \d0\().8h, \s5\().8h,  v0.h[5]
        mla             \d0\().8h, \s6\().8h,  v0.h[6]
        mla             \d0\().8h, \s7\().8h,  v0.h[7]
        // Second output: sources s4..s11.
        mul             \d1\().8h, \s4\().8h,  v0.h[0]
        mla             \d1\().8h, \s5\().8h,  v0.h[1]
        mla             \d1\().8h, \s6\().8h,  v0.h[2]
        mla             \d1\().8h, \s7\().8h,  v0.h[3]
        mla             \d1\().8h, \s8\().8h,  v0.h[4]
        mla             \d1\().8h, \s9\().8h,  v0.h[5]
        mla             \d1\().8h, \s10\().8h, v0.h[6]
        mla             \d1\().8h, \s11\().8h, v0.h[7]
.endm
.macro sqrshrun_b shift, r0, r1, r2, r3
......@@ -628,7 +631,7 @@ endfunc
st1 {\reg\().h}[3], [x8], \strd
.endif
.endm
.macro st_s strd, r0, r1, r2, r3
.macro st_s strd, r0, r1
st1 {\r0\().s}[0], [x0], \strd
st1 {\r0\().s}[1], [x8], \strd
.ifnb \r1
......@@ -636,7 +639,7 @@ endfunc
st1 {\r1\().s}[1], [x8], \strd
.endif
.endm
.macro st_d strd, r0, r1, r2, r3
.macro st_d strd, r0, r1
st1 {\r0\().d}[0], [x0], \strd
st1 {\r0\().d}[1], [x8], \strd
.ifnb \r1
......@@ -644,13 +647,13 @@ endfunc
st1 {\r1\().d}[1], [x8], \strd
.endif
.endm
// shift_store_4 type, strd, r0, r1[, r2, r3]
//   Applies a shift helper to the given registers and then stores them,
//   selecting the helper pair by \type:
//     \type == put : sqrshrun_b with shift 6, then st_s
//     otherwise    : srshr_h with shift 2, then st_d
//   \strd is forwarded unchanged to the store macro.
//
// NOTE(review): this span carries two `.macro shift_store_4` header lines
// and, in each branch, both a 4-register (\r0..\r3) and a 2-register
// (\r0, \r1) invocation of the same helpers, with only one `.endm`. It
// looks like the before/after lines of a change were merged together;
// presumably only one header and one invocation pair per branch should
// remain. Confirm against the real file and its callers before editing.
.macro shift_store_4 type, strd, r0, r1, r2, r3
.macro shift_store_4 type, strd, r0, r1
.ifc \type, put
sqrshrun_b 6, \r0, \r1, \r2, \r3
st_s \strd, \r0, \r1, \r2, \r3
sqrshrun_b 6, \r0, \r1
st_s \strd, \r0, \r1
.else
srshr_h 2, \r0, \r1, \r2, \r3
st_d \strd, \r0, \r1, \r2, \r3
srshr_h 2, \r0, \r1
st_d \strd, \r0, \r1
.endif
.endm
.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
......@@ -742,7 +745,7 @@ function \type\()_8tap
L(\type\()_8tap_h):
cmp \w, #4
ubfm w9, \mx, #7, #13
ubfx w9, \mx, #7, #7
and \mx, \mx, #0x7f
b.le 4f
mov \mx, w9
......@@ -965,7 +968,7 @@ L(\type\()_8tap_h_tbl):
L(\type\()_8tap_v):
cmp \h, #4
ubfm w9, \my, #7, #13
ubfx w9, \my, #7, #7
and \my, \my, #0x7f
b.le 4f
mov \my, w9
......@@ -1216,7 +1219,7 @@ L(\type\()_8tap_v):
160:
b.gt 1680b
// 16x4 v
// 16x2, 16x4 v
add \xmy, \xmy, #2
ld1 {v0.s}[0], [\xmy]
sub \src, \src, \s_strd
......@@ -1269,7 +1272,7 @@ L(\type\()_8tap_v_tbl):
L(\type\()_8tap_hv):
cmp \h, #4
ubfm w9, \my, #7, #13
ubfx w9, \my, #7, #7
and \my, \my, #0x7f
b.le 4f
mov \my, w9
......@@ -1304,21 +1307,19 @@ L(\type\()_8tap_hv):
ext v29.16b, v28.16b, v28.16b, #2
mul v28.4h, v28.4h, v0.4h
mul v29.4h, v29.4h, v0.4h
addv h28, v28.4h
addv h29, v29.4h
trn1 v16.4h, v28.4h, v29.4h
srshr v16.4h, v16.4h, #2
addp v28.4h, v28.4h, v29.4h
addp v16.4h, v28.4h, v28.4h
srshr v16.4h, v16.4h, #2
bl L(\type\()_8tap_filter_2)
trn1 v16.2s, v16.2s, v28.2s
trn1 v17.2s, v28.2s, v30.2s
mov v18.8b, v30.8b
mov v17.8b, v28.8b
2:
bl L(\type\()_8tap_filter_2)
trn1 v18.2s, v18.2s, v28.2s
trn1 v19.2s, v28.2s, v30.2s
ext v18.8b, v17.8b, v28.8b, #4
mov v19.8b, v28.8b
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
......@@ -1332,7 +1333,6 @@ L(\type\()_8tap_hv):
b.le 0f
mov v16.8b, v18.8b
mov v17.8b, v19.8b
mov v18.8b, v30.8b
b 2b
280: // 2x8, 2x16, 2x32 hv
......@@ -1352,28 +1352,24 @@ L(\type\()_8tap_hv):
ext v29.16b, v28.16b, v28.16b, #2
mul v28.4h, v28.4h, v0.4h
mul v29.4h, v29.4h, v0.4h
addv h28, v28.4h
addv h29, v29.4h
trn1 v16.4h, v28.4h, v29.4h
srshr v16.4h, v16.4h, #2
addp v28.4h, v28.4h, v29.4h
addp v16.4h, v28.4h, v28.4h
srshr v16.4h, v16.4h, #2
bl L(\type\()_8tap_filter_2)
trn1 v16.2s, v16.2s, v28.2s
trn1 v17.2s, v28.2s, v30.2s
mov v18.8b, v30.8b
mov v17.8b, v28.8b
bl L(\type\()_8tap_filter_2)
trn1 v18.2s, v18.2s, v28.2s
trn1 v19.2s, v28.2s, v30.2s
mov v20.8b, v30.8b
ext v18.8b, v17.8b, v28.8b, #4
mov v19.8b, v28.8b
bl L(\type\()_8tap_filter_2)
trn1 v20.2s, v20.2s, v28.2s
trn1 v21.2s, v28.2s, v30.2s
mov v22.8b, v30.8b
ext v20.8b, v19.8b, v28.8b, #4
mov v21.8b, v28.8b
28:
bl L(\type\()_8tap_filter_2)
trn1 v22.2s, v22.2s, v28.2s
trn1 v23.2s, v28.2s, v30.2s
ext v22.8b, v21.8b, v28.8b, #4
mov v23.8b, v28.8b
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
......@@ -1395,7 +1391,6 @@ L(\type\()_8tap_hv):
mov v19.8b, v21.8b
mov v20.8b, v22.8b
mov v21.8b, v23.8b
mov v22.8b, v30.8b
b 28b
0:
......@@ -1417,7 +1412,6 @@ L(\type\()_8tap_filter_2):
mla v27.4h, v30.4h, v0.h[2]
mla v27.4h, v31.4h, v0.h[3]
srshr v28.4h, v27.4h, #2
trn2 v30.2s, v28.2s, v28.2s
ret
.endif
......@@ -1453,14 +1447,17 @@ L(\type\()_8tap_filter_2):
mov v18.8b, v29.8b
4:
smull v2.4s, v16.4h, v1.h[0]
bl L(\type\()_8tap_filter_4)
smull v3.4s, v17.4h, v1.h[0]
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
smlal v3.4s, v28.4h, v1.h[2]
smlal v2.4s, v28.4h, v1.h[3]
smull v3.4s, v17.4h, v1.h[0]
smlal v3.4s, v18.4h, v1.h[1]
smlal v3.4s, v28.4h, v1.h[2]
smlal v3.4s, v29.4h, v1.h[3]
sqrshrn v2.4h, v2.4s, #\shift_hv
sqrshrn v3.4h, v3.4s, #\shift_hv
......@@ -1514,22 +1511,22 @@ L(\type\()_8tap_filter_2):
mov v22.8b, v29.8b
48:
smull v2.4s, v16.4h, v1.h[0]
bl L(\type\()_8tap_filter_4)
smull v3.4s, v17.4h, v1.h[0]
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
smlal v3.4s, v19.4h, v1.h[2]
smlal v2.4s, v19.4h, v1.h[3]
smlal v3.4s, v20.4h, v1.h[3]
smlal v2.4s, v20.4h, v1.h[4]
smlal v3.4s, v21.4h, v1.h[4]
smlal v2.4s, v21.4h, v1.h[5]
smlal v3.4s, v22.4h, v1.h[5]
smlal v2.4s, v22.4h, v1.h[6]
smlal v3.4s, v28.4h, v1.h[6]
smlal v2.4s, v28.4h, v1.h[7]
smull v3.4s, v17.4h, v1.h[0]
smlal v3.4s, v18.4h, v1.h[1]
smlal v3.4s, v19.4h, v1.h[2]
smlal v3.4s, v20.4h, v1.h[3]
smlal v3.4s, v21.4h, v1.h[4]
smlal v3.4s, v22.4h, v1.h[5]
smlal v3.4s, v28.4h, v1.h[6]
smlal v3.4s, v29.4h, v1.h[7]
sqrshrn v2.4h, v2.4s, #\shift_hv
sqrshrn v3.4h, v3.4s, #\shift_hv
......
......@@ -114,7 +114,9 @@ EXTERN\name:
#endif
.purgem endconst
.endm
#if !defined(__MACH__)
#if defined(_WIN32)
.section .rdata
#elif !defined(__MACH__)
.section .rodata
#else
.const_data
......
......@@ -81,8 +81,8 @@ enum IntraPredMode
const pixel *dst, ptrdiff_t stride,
const pixel *prefilter_toplevel_sb_edge,
enum IntraPredMode mode, int *angle,
int tw, int th, pixel *topleft_out
HIGHBD_DECL_SUFFIX);
int tw, int th, int filter_edge,
pixel *topleft_out HIGHBD_DECL_SUFFIX);
// These flags are OR'd with the angle argument into intra predictors.
// ANGLE_USE_EDGE_FILTER_FLAG signals that edges should be convolved
......
......@@ -82,7 +82,7 @@ bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
const ptrdiff_t stride,
const pixel *prefilter_toplevel_sb_edge,
enum IntraPredMode mode, int *const angle,
const int tw, const int th,
const int tw, const int th, const int filter_edge,
pixel *const topleft_out HIGHBD_DECL_SUFFIX)
{
const int bitdepth = bitdepth_from_max(bitdepth_max);
......@@ -201,7 +201,7 @@ bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
} else {
*topleft_out = have_top ? *dst_top : (1 << bitdepth) >> 1;
}
if (mode == Z2_PRED && tw + th >= 6)
if (mode == Z2_PRED && tw + th >= 6 && filter_edge)
*topleft_out = (topleft_out[-1] * 5 + topleft_out[0] * 6 +
topleft_out[1] * 5 + 8) >> 4;
}
......
......@@ -833,8 +833,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
edge_flags, dst,
f->cur.stride[0], top_sb_edge,
b->y_mode, &angle,
t_dim->w, t_dim->h, edge
HIGHBD_CALL_SUFFIX);
t_dim->w, t_dim->h,
f->seq_hdr->intra_edge_filter,
edge HIGHBD_CALL_SUFFIX);
dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
t_dim->w * 4, t_dim->h * 4,
angle | intra_flags,
......@@ -951,9 +952,8 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
ts->tiling.row_end >> ss_ver,
0, uv_dst[pl], stride,
top_sb_edge, DC_PRED, &angle,
uv_t_dim->w,
uv_t_dim->h, edge
HIGHBD_CALL_SUFFIX);
uv_t_dim->w, uv_t_dim->h, 0,
edge HIGHBD_CALL_SUFFIX);
dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
uv_t_dim->w * 4,
uv_t_dim->h * 4,
......@@ -1053,8 +1053,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
edge_flags, dst, stride,
top_sb_edge, uv_mode,
&angle, uv_t_dim->w,
uv_t_dim->h, edge
HIGHBD_CALL_SUFFIX);
uv_t_dim->h,
f->seq_hdr->intra_edge_filter,
edge HIGHBD_CALL_SUFFIX);
angle |= intra_edge_filter_flag;
dsp->ipred.intra_pred[m](dst, stride, edge,
uv_t_dim->w * 4,
......@@ -1216,7 +1217,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
t->by, t->by > ts->tiling.row_start,
ts->tiling.col_end, ts->tiling.row_end,
0, dst, f->cur.stride[0], top_sb_edge,
m, &angle, bw4, bh4, tl_edge
m, &angle, bw4, bh4, 0, tl_edge
HIGHBD_CALL_SUFFIX);
dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
......@@ -1358,7 +1359,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
ts->tiling.row_end >> ss_ver,
0, uvdst, f->cur.stride[1],
top_sb_edge, m,
&angle, cbw4, cbh4, tl_edge
&angle, cbw4, cbh4, 0, tl_edge
HIGHBD_CALL_SUFFIX);
dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0
......