...
 
Commits (10)
  • James Almer's avatar
    obu: set subsampling to 1 for monochrome · e19c7699
    James Almer authored
    e19c7699
  • James Almer's avatar
    obu: fix separate_uv_delta_q for RGB · 18d2d750
    James Almer authored
    18d2d750
  • Henrik Gramner's avatar
    Add ipred_z3 AVX2 asm · a440af4a
    Henrik Gramner authored
    Also backport some minor optimizations to z1.
    a440af4a
  • Henrik Gramner's avatar
    Shrink dav1d_dr_intra_derivative[] · f813285c
    Henrik Gramner authored
    f813285c
  • Henrik Gramner's avatar
    Add minor x86 bilin mc optimizations · f753caea
    Henrik Gramner authored
    f753caea
  • James Almer's avatar
    allocate Tile Group cache dynamically · 46435a53
    James Almer authored
    46435a53
  • Marvin Scholz's avatar
    33ce3829
  • Henrik Gramner's avatar
    Add SGR optimizations · 205b723e
    Henrik Gramner authored
    205b723e
  • Liwei Wang's avatar
    Add SSSE3 implementation for the 4x16 and 16x4 blocks in itx · bf659082
    Liwei Wang authored
    Cycle times:
    inv_txfm_add_4x16_adst_adst_0_8bpc_c: 2203.6
    inv_txfm_add_4x16_adst_adst_0_8bpc_ssse3: 198.7
    inv_txfm_add_4x16_adst_adst_1_8bpc_c: 2235.1
    inv_txfm_add_4x16_adst_adst_1_8bpc_ssse3: 199.7
    inv_txfm_add_4x16_adst_adst_2_8bpc_c: 2199.1
    inv_txfm_add_4x16_adst_adst_2_8bpc_ssse3: 199.9
    inv_txfm_add_4x16_adst_dct_0_8bpc_c: 2272.4
    inv_txfm_add_4x16_adst_dct_0_8bpc_ssse3: 50.0
    inv_txfm_add_4x16_adst_dct_1_8bpc_c: 2281.6
    inv_txfm_add_4x16_adst_dct_1_8bpc_ssse3: 163.7
    inv_txfm_add_4x16_adst_dct_2_8bpc_c: 2262.5
    inv_txfm_add_4x16_adst_dct_2_8bpc_ssse3: 164.7
    inv_txfm_add_4x16_adst_flipadst_0_8bpc_c: 2456.5
    inv_txfm_add_4x16_adst_flipadst_0_8bpc_ssse3: 204.3
    inv_txfm_add_4x16_adst_flipadst_1_8bpc_c: 2349.1
    inv_txfm_add_4x16_adst_flipadst_1_8bpc_ssse3: 198.5
    inv_txfm_add_4x16_adst_flipadst_2_8bpc_c: 2241.5
    inv_txfm_add_4x16_adst_flipadst_2_8bpc_ssse3: 198.7
    inv_txfm_add_4x16_adst_identity_0_8bpc_c: 1574.7
    inv_txfm_add_4x16_adst_identity_0_8bpc_ssse3: 117.0
    inv_txfm_add_4x16_adst_identity_1_8bpc_c: 1576.3
    inv_txfm_add_4x16_adst_identity_1_8bpc_ssse3: 116.6
    inv_txfm_add_4x16_adst_identity_2_8bpc_c: 1572.9
    inv_txfm_add_4x16_adst_identity_2_8bpc_ssse3: 116.7
    inv_txfm_add_4x16_dct_adst_0_8bpc_c: 2162.8
    inv_txfm_add_4x16_dct_adst_0_8bpc_ssse3: 187.6
    inv_txfm_add_4x16_dct_adst_1_8bpc_c: 2180.4
    inv_txfm_add_4x16_dct_adst_1_8bpc_ssse3: 185.6
    inv_txfm_add_4x16_dct_adst_2_8bpc_c: 2165.1
    inv_txfm_add_4x16_dct_adst_2_8bpc_ssse3: 184.9
    inv_txfm_add_4x16_dct_dct_0_8bpc_c: 2233.7
    inv_txfm_add_4x16_dct_dct_0_8bpc_ssse3: 49.5
    inv_txfm_add_4x16_dct_dct_1_8bpc_c: 2770.4
    inv_txfm_add_4x16_dct_dct_1_8bpc_ssse3: 148.4
    inv_txfm_add_4x16_dct_dct_2_8bpc_c: 2288.7
    inv_txfm_add_4x16_dct_dct_2_8bpc_ssse3: 149.0
    inv_txfm_add_4x16_dct_flipadst_0_8bpc_c: 2242.0
    inv_txfm_add_4x16_dct_flipadst_0_8bpc_ssse3: 185.8
    inv_txfm_add_4x16_dct_flipadst_1_8bpc_c: 2249.6
    inv_txfm_add_4x16_dct_flipadst_1_8bpc_ssse3: 188.4
    inv_txfm_add_4x16_dct_flipadst_2_8bpc_c: 2237.3
    inv_txfm_add_4x16_dct_flipadst_2_8bpc_ssse3: 185.1
    inv_txfm_add_4x16_dct_identity_0_8bpc_c: 1532.3
    inv_txfm_add_4x16_dct_identity_0_8bpc_ssse3: 63.7
    inv_txfm_add_4x16_dct_identity_1_8bpc_c: 1534.5
    inv_txfm_add_4x16_dct_identity_1_8bpc_ssse3: 63.6
    inv_txfm_add_4x16_dct_identity_2_8bpc_c: 1548.1
    inv_txfm_add_4x16_dct_identity_2_8bpc_ssse3: 101.6
    inv_txfm_add_4x16_flipadst_adst_0_8bpc_c: 2205.2
    inv_txfm_add_4x16_flipadst_adst_0_8bpc_ssse3: 201.6
    inv_txfm_add_4x16_flipadst_adst_1_8bpc_c: 2222.0
    inv_txfm_add_4x16_flipadst_adst_1_8bpc_ssse3: 202.6
    inv_txfm_add_4x16_flipadst_adst_2_8bpc_c: 2205.2
    inv_txfm_add_4x16_flipadst_adst_2_8bpc_ssse3: 205.7
    inv_txfm_add_4x16_flipadst_dct_0_8bpc_c: 2294.9
    inv_txfm_add_4x16_flipadst_dct_0_8bpc_ssse3: 50.0
    inv_txfm_add_4x16_flipadst_dct_1_8bpc_c: 2304.2
    inv_txfm_add_4x16_flipadst_dct_1_8bpc_ssse3: 164.5
    inv_txfm_add_4x16_flipadst_dct_2_8bpc_c: 2292.7
    inv_txfm_add_4x16_flipadst_dct_2_8bpc_ssse3: 164.5
    inv_txfm_add_4x16_flipadst_flipadst_0_8bpc_c: 2281.3
    inv_txfm_add_4x16_flipadst_flipadst_0_8bpc_ssse3: 202.9
    inv_txfm_add_4x16_flipadst_flipadst_1_8bpc_c: 2258.7
    inv_txfm_add_4x16_flipadst_flipadst_1_8bpc_ssse3: 202.4
    inv_txfm_add_4x16_flipadst_flipadst_2_8bpc_c: 2261.0
    inv_txfm_add_4x16_flipadst_flipadst_2_8bpc_ssse3: 201.3
    inv_txfm_add_4x16_flipadst_identity_0_8bpc_c: 1580.5
    inv_txfm_add_4x16_flipadst_identity_0_8bpc_ssse3: 116.1
    inv_txfm_add_4x16_flipadst_identity_1_8bpc_c: 1578.7
    inv_txfm_add_4x16_flipadst_identity_1_8bpc_ssse3: 116.7
    inv_txfm_add_4x16_flipadst_identity_2_8bpc_c: 1590.8
    inv_txfm_add_4x16_flipadst_identity_2_8bpc_ssse3: 117.4
    inv_txfm_add_4x16_identity_adst_0_8bpc_c: 1949.0
    inv_txfm_add_4x16_identity_adst_0_8bpc_ssse3: 170.9
    inv_txfm_add_4x16_identity_adst_1_8bpc_c: 1947.4
    inv_txfm_add_4x16_identity_adst_1_8bpc_ssse3: 171.0
    inv_txfm_add_4x16_identity_adst_2_8bpc_c: 1948.7
    inv_txfm_add_4x16_identity_adst_2_8bpc_ssse3: 170.3
    inv_txfm_add_4x16_identity_dct_0_8bpc_c: 2022.3
    inv_txfm_add_4x16_identity_dct_0_8bpc_ssse3: 59.2
    inv_txfm_add_4x16_identity_dct_1_8bpc_c: 2020.8
    inv_txfm_add_4x16_identity_dct_1_8bpc_ssse3: 133.7
    inv_txfm_add_4x16_identity_dct_2_8bpc_c: 2020.2
    inv_txfm_add_4x16_identity_dct_2_8bpc_ssse3: 133.2
    inv_txfm_add_4x16_identity_flipadst_0_8bpc_c: 2024.7
    inv_txfm_add_4x16_identity_flipadst_0_8bpc_ssse3: 170.3
    inv_txfm_add_4x16_identity_flipadst_1_8bpc_c: 2021.8
    inv_txfm_add_4x16_identity_flipadst_1_8bpc_ssse3: 170.0
    inv_txfm_add_4x16_identity_flipadst_2_8bpc_c: 2022.5
    inv_txfm_add_4x16_identity_flipadst_2_8bpc_ssse3: 169.9
    inv_txfm_add_4x16_identity_identity_0_8bpc_c: 1328.4
    inv_txfm_add_4x16_identity_identity_0_8bpc_ssse3: 87.7
    inv_txfm_add_4x16_identity_identity_1_8bpc_c: 1330.9
    inv_txfm_add_4x16_identity_identity_1_8bpc_ssse3: 87.7
    inv_txfm_add_4x16_identity_identity_2_8bpc_c: 1327.3
    inv_txfm_add_4x16_identity_identity_2_8bpc_ssse3: 87.6
    inv_txfm_add_16x4_adst_adst_0_8bpc_c: 2166.3
    inv_txfm_add_16x4_adst_adst_0_8bpc_ssse3: 186.3
    inv_txfm_add_16x4_adst_adst_1_8bpc_c: 2166.9
    inv_txfm_add_16x4_adst_adst_1_8bpc_ssse3: 184.9
    inv_txfm_add_16x4_adst_adst_2_8bpc_c: 2167.2
    inv_txfm_add_16x4_adst_adst_2_8bpc_ssse3: 185.2
    inv_txfm_add_16x4_adst_dct_0_8bpc_c: 2123.2
    inv_txfm_add_16x4_adst_dct_0_8bpc_ssse3: 172.1
    inv_txfm_add_16x4_adst_dct_1_8bpc_c: 2124.2
    inv_txfm_add_16x4_adst_dct_1_8bpc_ssse3: 171.2
    inv_txfm_add_16x4_adst_dct_2_8bpc_c: 2122.8
    inv_txfm_add_16x4_adst_dct_2_8bpc_ssse3: 171.8
    inv_txfm_add_16x4_adst_flipadst_0_8bpc_c: 2213.3
    inv_txfm_add_16x4_adst_flipadst_0_8bpc_ssse3: 189.6
    inv_txfm_add_16x4_adst_flipadst_1_8bpc_c: 2227.7
    inv_txfm_add_16x4_adst_flipadst_1_8bpc_ssse3: 188.4
    inv_txfm_add_16x4_adst_flipadst_2_8bpc_c: 2228.5
    inv_txfm_add_16x4_adst_flipadst_2_8bpc_ssse3: 188.4
    inv_txfm_add_16x4_adst_identity_0_8bpc_c: 1906.7
    inv_txfm_add_16x4_adst_identity_0_8bpc_ssse3: 154.3
    inv_txfm_add_16x4_adst_identity_1_8bpc_c: 1905.2
    inv_txfm_add_16x4_adst_identity_1_8bpc_ssse3: 155.6
    inv_txfm_add_16x4_adst_identity_2_8bpc_c: 1905.6
    inv_txfm_add_16x4_adst_identity_2_8bpc_ssse3: 156.3
    inv_txfm_add_16x4_dct_adst_0_8bpc_c: 2209.8
    inv_txfm_add_16x4_dct_adst_0_8bpc_ssse3: 37.4
    inv_txfm_add_16x4_dct_adst_1_8bpc_c: 2209.8
    inv_txfm_add_16x4_dct_adst_1_8bpc_ssse3: 157.9
    inv_txfm_add_16x4_dct_adst_2_8bpc_c: 2221.1
    inv_txfm_add_16x4_dct_adst_2_8bpc_ssse3: 158.5
    inv_txfm_add_16x4_dct_dct_0_8bpc_c: 2177.5
    inv_txfm_add_16x4_dct_dct_0_8bpc_ssse3: 29.6
    inv_txfm_add_16x4_dct_dct_1_8bpc_c: 2179.3
    inv_txfm_add_16x4_dct_dct_1_8bpc_ssse3: 144.9
    inv_txfm_add_16x4_dct_dct_2_8bpc_c: 2177.8
    inv_txfm_add_16x4_dct_dct_2_8bpc_ssse3: 143.7
    inv_txfm_add_16x4_dct_flipadst_0_8bpc_c: 2293.6
    inv_txfm_add_16x4_dct_flipadst_0_8bpc_ssse3: 38.3
    inv_txfm_add_16x4_dct_flipadst_1_8bpc_c: 2293.2
    inv_txfm_add_16x4_dct_flipadst_1_8bpc_ssse3: 163.9
    inv_txfm_add_16x4_dct_flipadst_2_8bpc_c: 2301.3
    inv_txfm_add_16x4_dct_flipadst_2_8bpc_ssse3: 163.7
    inv_txfm_add_16x4_dct_identity_0_8bpc_c: 1977.7
    inv_txfm_add_16x4_dct_identity_0_8bpc_ssse3: 39.9
    inv_txfm_add_16x4_dct_identity_1_8bpc_c: 1978.7
    inv_txfm_add_16x4_dct_identity_1_8bpc_ssse3: 126.8
    inv_txfm_add_16x4_dct_identity_2_8bpc_c: 1979.5
    inv_txfm_add_16x4_dct_identity_2_8bpc_ssse3: 128.1
    inv_txfm_add_16x4_flipadst_adst_0_8bpc_c: 2175.6
    inv_txfm_add_16x4_flipadst_adst_0_8bpc_ssse3: 185.1
    inv_txfm_add_16x4_flipadst_adst_1_8bpc_c: 2175.7
    inv_txfm_add_16x4_flipadst_adst_1_8bpc_ssse3: 185.7
    inv_txfm_add_16x4_flipadst_adst_2_8bpc_c: 2173.1
    inv_txfm_add_16x4_flipadst_adst_2_8bpc_ssse3: 185.0
    inv_txfm_add_16x4_flipadst_dct_0_8bpc_c: 2140.5
    inv_txfm_add_16x4_flipadst_dct_0_8bpc_ssse3: 172.0
    inv_txfm_add_16x4_flipadst_dct_1_8bpc_c: 2147.5
    inv_txfm_add_16x4_flipadst_dct_1_8bpc_ssse3: 171.9
    inv_txfm_add_16x4_flipadst_dct_2_8bpc_c: 2148.5
    inv_txfm_add_16x4_flipadst_dct_2_8bpc_ssse3: 172.0
    inv_txfm_add_16x4_flipadst_flipadst_0_8bpc_c: 2240.6
    inv_txfm_add_16x4_flipadst_flipadst_0_8bpc_ssse3: 191.3
    inv_txfm_add_16x4_flipadst_flipadst_1_8bpc_c: 2243.5
    inv_txfm_add_16x4_flipadst_flipadst_1_8bpc_ssse3: 193.2
    inv_txfm_add_16x4_flipadst_flipadst_2_8bpc_c: 2242.9
    inv_txfm_add_16x4_flipadst_flipadst_2_8bpc_ssse3: 192.0
    inv_txfm_add_16x4_flipadst_identity_0_8bpc_c: 1919.2
    inv_txfm_add_16x4_flipadst_identity_0_8bpc_ssse3: 155.1
    inv_txfm_add_16x4_flipadst_identity_1_8bpc_c: 1925.2
    inv_txfm_add_16x4_flipadst_identity_1_8bpc_ssse3: 155.2
    inv_txfm_add_16x4_flipadst_identity_2_8bpc_c: 2084.8
    inv_txfm_add_16x4_flipadst_identity_2_8bpc_ssse3: 155.0
    inv_txfm_add_16x4_identity_adst_0_8bpc_c: 1498.5
    inv_txfm_add_16x4_identity_adst_0_8bpc_ssse3: 107.6
    inv_txfm_add_16x4_identity_adst_1_8bpc_c: 1499.5
    inv_txfm_add_16x4_identity_adst_1_8bpc_ssse3: 107.0
    inv_txfm_add_16x4_identity_adst_2_8bpc_c: 1498.9
    inv_txfm_add_16x4_identity_adst_2_8bpc_ssse3: 107.9
    inv_txfm_add_16x4_identity_dct_0_8bpc_c: 1471.9
    inv_txfm_add_16x4_identity_dct_0_8bpc_ssse3: 45.4
    inv_txfm_add_16x4_identity_dct_1_8bpc_c: 1476.4
    inv_txfm_add_16x4_identity_dct_1_8bpc_ssse3: 45.5
    inv_txfm_add_16x4_identity_dct_2_8bpc_c: 1459.8
    inv_txfm_add_16x4_identity_dct_2_8bpc_ssse3: 92.3
    inv_txfm_add_16x4_identity_flipadst_0_8bpc_c: 1548.7
    inv_txfm_add_16x4_identity_flipadst_0_8bpc_ssse3: 112.1
    inv_txfm_add_16x4_identity_flipadst_1_8bpc_c: 1548.2
    inv_txfm_add_16x4_identity_flipadst_1_8bpc_ssse3: 111.7
    inv_txfm_add_16x4_identity_flipadst_2_8bpc_c: 1547.2
    inv_txfm_add_16x4_identity_flipadst_2_8bpc_ssse3: 114.1
    inv_txfm_add_16x4_identity_identity_0_8bpc_c: 1271.5
    inv_txfm_add_16x4_identity_identity_0_8bpc_ssse3: 74.5
    inv_txfm_add_16x4_identity_identity_1_8bpc_c: 1266.8
    inv_txfm_add_16x4_identity_identity_1_8bpc_ssse3: 74.5
    inv_txfm_add_16x4_identity_identity_2_8bpc_c: 1268.0
    inv_txfm_add_16x4_identity_identity_2_8bpc_ssse3: 74.6
    bf659082
  • Ronald S. Bultje's avatar
    Don't filter top/left intra edge if intra_edge_filter=0 · 0a8df458
    Ronald S. Bultje authored
    Fixes #236.
    0a8df458
......@@ -70,7 +70,7 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr
# Compile
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86* targets, [nasm](https://nasm.us/) (2.13 or higher)
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86* targets, [nasm](https://nasm.us/) (2.13.02 or higher)
2. Run `meson build --buildtype release`
3. Build with `ninja -C build`
......
......@@ -311,8 +311,8 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
nasm_r = run_command(nasm, '-v')
out = nasm_r.stdout().strip().split()
if out[1].to_lower() == 'version'
if out[2].version_compare('<2.13')
error('nasm 2.13 or later is required, found nasm @0@'.format(out[2]))
if out[2].version_compare('<2.13.02')
error('nasm 2.13.02 or later is required, found nasm @0@'.format(out[2]))
endif
else
error('unexpected nasm version string: @0@'.format(nasm_r.stdout()))
......
......@@ -3172,6 +3172,12 @@ int dav1d_submit_frame(Dav1dContext *const c) {
}
// FIXME qsort so tiles are in order (for frame threading)
if (f->n_tile_data_alloc < c->n_tile_data) {
struct Dav1dTileGroup *tile = realloc(f->tile, c->n_tile_data * sizeof(*f->tile));
if (!tile) goto error;
f->tile = tile;
f->n_tile_data_alloc = c->n_tile_data;
}
memcpy(f->tile, c->tile, c->n_tile_data * sizeof(*f->tile));
memset(c->tile, 0, c->n_tile_data * sizeof(*c->tile));
f->n_tile_data = c->n_tile_data;
......
......@@ -65,16 +65,19 @@ typedef struct Dav1dDSPContext {
Dav1dLoopRestorationDSPContext lr;
} Dav1dDSPContext;
// One cached tile-group entry. Dav1dContext and Dav1dFrameContext each keep
// a dynamically sized array of these; the context-side array is grown while
// handling OBU_TILE_GRP and copied to the frame context on frame submission.
struct Dav1dTileGroup {
    // Data reference for this entry; dav1d_close() releases the cached ones
    // through dav1d_data_unref_internal().
    Dav1dData data;
    // NOTE(review): presumably the first tile index covered by this
    // group -- confirm against parse_tile_hdr().
    int start;
    // NOTE(review): presumably the last tile index covered by this
    // group -- confirm against parse_tile_hdr().
    int end;
};
struct Dav1dContext {
Dav1dFrameContext *fc;
unsigned n_fc;
// cache of OBUs that make up a single frame before we submit them
// to a frame worker to be decoded
struct {
Dav1dData data;
int start, end;
} tile[256];
struct Dav1dTileGroup *tile;
int n_tile_data_alloc;
int n_tile_data;
int n_tiles;
Dav1dRef *seq_hdr_ref;
......@@ -139,10 +142,8 @@ struct Dav1dFrameContext {
unsigned refpoc[7], refrefpoc[7][7];
uint8_t gmv_warp_allowed[7];
CdfThreadContext in_cdf, out_cdf;
struct {
Dav1dData data;
int start, end;
} tile[256];
struct Dav1dTileGroup *tile;
int n_tile_data_alloc;
int n_tile_data;
// for scalable references
......
......@@ -81,8 +81,8 @@ enum IntraPredMode
const pixel *dst, ptrdiff_t stride,
const pixel *prefilter_toplevel_sb_edge,
enum IntraPredMode mode, int *angle,
int tw, int th, pixel *topleft_out
HIGHBD_DECL_SUFFIX);
int tw, int th, int filter_edge,
pixel *topleft_out HIGHBD_DECL_SUFFIX);
// These flags are OR'd with the angle argument into intra predictors.
// ANGLE_USE_EDGE_FILTER_FLAG signals that edges should be convolved
......
......@@ -82,7 +82,7 @@ bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
const ptrdiff_t stride,
const pixel *prefilter_toplevel_sb_edge,
enum IntraPredMode mode, int *const angle,
const int tw, const int th,
const int tw, const int th, const int filter_edge,
pixel *const topleft_out HIGHBD_DECL_SUFFIX)
{
const int bitdepth = bitdepth_from_max(bitdepth_max);
......@@ -201,7 +201,7 @@ bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
} else {
*topleft_out = have_top ? *dst_top : (1 << bitdepth) >> 1;
}
if (mode == Z2_PRED && tw + th >= 6)
if (mode == Z2_PRED && tw + th >= 6 && filter_edge)
*topleft_out = (topleft_out[-1] * 5 + topleft_out[0] * 6 +
topleft_out[1] * 5 + 8) >> 4;
}
......
......@@ -422,7 +422,7 @@ static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
const int enable_intra_edge_filter = angle >> 10;
angle &= 511;
assert(angle < 90);
int dx = dav1d_dr_intra_derivative[angle];
int dx = dav1d_dr_intra_derivative[angle >> 1];
pixel top_out[(64 + 64) * 2];
const pixel *top;
int max_base_x;
......@@ -476,8 +476,8 @@ static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
const int enable_intra_edge_filter = angle >> 10;
angle &= 511;
assert(angle > 90 && angle < 180);
int dy = dav1d_dr_intra_derivative[angle - 90];
int dx = dav1d_dr_intra_derivative[180 - angle];
int dy = dav1d_dr_intra_derivative[(angle - 90) >> 1];
int dx = dav1d_dr_intra_derivative[(180 - angle) >> 1];
const int upsample_left = enable_intra_edge_filter ?
get_upsample(width + height, 180 - angle, is_sm) : 0;
const int upsample_above = enable_intra_edge_filter ?
......@@ -557,7 +557,7 @@ static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
const int enable_intra_edge_filter = angle >> 10;
angle &= 511;
assert(angle > 180);
int dy = dav1d_dr_intra_derivative[270 - angle];
int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1];
pixel left_out[(64 + 64) * 2];
const pixel *left;
int max_base_y;
......
......@@ -473,6 +473,7 @@ void dav1d_close(Dav1dContext **const c_out) {
dav1d_free_aligned(f->tc);
dav1d_free_aligned(f->ipred_edge[0]);
free(f->a);
free(f->tile);
free(f->lf.mask);
free(f->lf.lr_mask);
free(f->lf.level);
......@@ -491,6 +492,7 @@ void dav1d_close(Dav1dContext **const c_out) {
}
for (int n = 0; n < c->n_tile_data; n++)
dav1d_data_unref_internal(&c->tile[n].data);
free(c->tile);
for (int n = 0; n < 8; n++) {
dav1d_cdf_thread_unref(&c->cdf[n]);
if (c->refs[n].p.p.data[0])
......
......@@ -446,11 +446,11 @@ static void selfguided_filter(coef *dst, const pixel *src,
const unsigned p = imax(a * n - b * b, 0);
const unsigned z = (p * s + (1 << 19)) >> 20;
const unsigned x = dav1d_sgr_x_by_x[imin(z, 255)];
const int x = dav1d_sgr_x_by_xplus1[imin(z, 255)];
// This is where we invert A and B, so that B is of size coef.
AA[i] = (((1U << 8) - x) * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
BB[i] = x;
AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
BB[i] = 256 - x;
}
AA += step * REST_UNIT_STRIDE;
BB += step * REST_UNIT_STRIDE;
......
......@@ -221,7 +221,7 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
if (hdr->monochrome) {
hdr->color_range = dav1d_get_bits(gb, 1);
hdr->layout = DAV1D_PIXEL_LAYOUT_I400;
hdr->ss_hor = hdr->ss_ver = 0;
hdr->ss_hor = hdr->ss_ver = 1;
hdr->chr = DAV1D_CHR_UNKNOWN;
hdr->separate_uv_delta_q = 0;
} else if (hdr->pri == DAV1D_COLOR_PRI_BT709 &&
......@@ -258,8 +258,8 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
}
hdr->chr = hdr->ss_hor == 1 && hdr->ss_ver == 1 ?
dav1d_get_bits(gb, 2) : DAV1D_CHR_UNKNOWN;
hdr->separate_uv_delta_q = dav1d_get_bits(gb, 1);
}
hdr->separate_uv_delta_q = !hdr->monochrome && dav1d_get_bits(gb, 1);
#if DEBUG_SEQ_HDR
printf("SEQHDR: post-colorinfo: off=%ld\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
......@@ -1311,7 +1311,14 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
case OBU_TILE_GRP: {
if (global) break;
if (!c->frame_hdr) goto error;
if (c->n_tile_data >= 256) goto error;
if (c->n_tile_data_alloc < c->n_tile_data + 1) {
if ((c->n_tile_data + 1) > INT_MAX / (int)sizeof(*c->tile)) goto error;
struct Dav1dTileGroup *tile = realloc(c->tile, (c->n_tile_data + 1) * sizeof(*c->tile));
if (!tile) goto error;
c->tile = tile;
memset(c->tile + c->n_tile_data, 0, sizeof(*c->tile));
c->n_tile_data_alloc = c->n_tile_data + 1;
}
parse_tile_hdr(c, &gb);
// Align to the next byte boundary and check for overrun.
dav1d_bytealign_get_bits(&gb);
......
......@@ -833,8 +833,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
edge_flags, dst,
f->cur.stride[0], top_sb_edge,
b->y_mode, &angle,
t_dim->w, t_dim->h, edge
HIGHBD_CALL_SUFFIX);
t_dim->w, t_dim->h,
f->seq_hdr->intra_edge_filter,
edge HIGHBD_CALL_SUFFIX);
dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
t_dim->w * 4, t_dim->h * 4,
angle | intra_flags,
......@@ -951,9 +952,8 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
ts->tiling.row_end >> ss_ver,
0, uv_dst[pl], stride,
top_sb_edge, DC_PRED, &angle,
uv_t_dim->w,
uv_t_dim->h, edge
HIGHBD_CALL_SUFFIX);
uv_t_dim->w, uv_t_dim->h, 0,
edge HIGHBD_CALL_SUFFIX);
dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
uv_t_dim->w * 4,
uv_t_dim->h * 4,
......@@ -1053,8 +1053,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
edge_flags, dst, stride,
top_sb_edge, uv_mode,
&angle, uv_t_dim->w,
uv_t_dim->h, edge
HIGHBD_CALL_SUFFIX);
uv_t_dim->h,
f->seq_hdr->intra_edge_filter,
edge HIGHBD_CALL_SUFFIX);
angle |= intra_edge_filter_flag;
dsp->ipred.intra_pred[m](dst, stride, edge,
uv_t_dim->w * 4,
......@@ -1216,7 +1217,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
t->by, t->by > ts->tiling.row_start,
ts->tiling.col_end, ts->tiling.row_end,
0, dst, f->cur.stride[0], top_sb_edge,
m, &angle, bw4, bh4, tl_edge
m, &angle, bw4, bh4, 0, tl_edge
HIGHBD_CALL_SUFFIX);
dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
......@@ -1358,7 +1359,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
ts->tiling.row_end >> ss_ver,
0, uvdst, f->cur.stride[1],
top_sb_edge, m,
&angle, cbw4, cbh4, tl_edge
&angle, cbw4, cbh4, 0, tl_edge
HIGHBD_CALL_SUFFIX);
dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0
......
......@@ -502,25 +502,25 @@ const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1
{ 2, 0, 22, -1 },
};
const int dav1d_sgr_x_by_xplus1[256] = {
1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
256,
const uint8_t dav1d_sgr_x_by_x[256] = {
255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17,
16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9,
8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,
6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0
};
const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = {
......@@ -775,37 +775,36 @@ const uint8_t dav1d_sm_weights[128] = {
7, 6, 6, 5, 5, 4, 4, 4
};
const int16_t dav1d_dr_intra_derivative[90] = {
// More evenly spread out angles and limited to 10-bit
const uint16_t dav1d_dr_intra_derivative[44] = {
// Values that are 0 will never be used
0, 0, 0, // Approx angle
1023, 0, 0, // 3, ...
547, 0, 0, // 6, ...
372, 0, 0, 0, 0, // 9, ...
273, 0, 0, // 14, ...
215, 0, 0, // 17, ...
178, 0, 0, // 20, ...
151, 0, 0, // 23, ... (113 & 203 are base angles)
132, 0, 0, // 26, ...
116, 0, 0, // 29, ...
102, 0, 0, 0, // 32, ...
90, 0, 0, // 36, ...
80, 0, 0, // 39, ...
71, 0, 0, // 42, ...
64, 0, 0, // 45, ... (45 & 135 are base angles)
57, 0, 0, // 48, ...
51, 0, 0, // 51, ...
45, 0, 0, 0, // 54, ...
40, 0, 0, // 58, ...
35, 0, 0, // 61, ...
31, 0, 0, // 64, ...
27, 0, 0, // 67, ... (67 & 157 are base angles)
23, 0, 0, // 70, ...
19, 0, 0, // 73, ...
15, 0, 0, 0, 0, // 76, ...
11, 0, 0, // 81, ...
7, 0, 0, // 84, ...
3, 0, 0, // 87, ...
0, // Angles:
1023, 0, // 3, 93, 183
547, // 6, 96, 186
372, 0, 0, // 9, 99, 189
273, // 14, 104, 194
215, 0, // 17, 107, 197
178, // 20, 110, 200
151, 0, // 23, 113, 203 (113 & 203 are base angles)
132, // 26, 116, 206
116, 0, // 29, 119, 209
102, 0, // 32, 122, 212
90, // 36, 126, 216
80, 0, // 39, 129, 219
71, // 42, 132, 222
64, 0, // 45, 135, 225 (45 & 135 are base angles)
57, // 48, 138, 228
51, 0, // 51, 141, 231
45, 0, // 54, 144, 234
40, // 58, 148, 238
35, 0, // 61, 151, 241
31, // 64, 154, 244
27, 0, // 67, 157, 247 (67 & 157 are base angles)
23, // 70, 160, 250
19, 0, // 73, 163, 253
15, 0, // 76, 166, 256
11, 0, // 81, 171, 261
7, // 84, 174, 264
3 // 87, 177, 267
};
const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = {
......
......@@ -107,14 +107,14 @@ static const unsigned interintra_allowed_mask =
extern const Dav1dWarpedMotionParams dav1d_default_wm_params;
extern const int16_t dav1d_sgr_params[16][4];
extern const int dav1d_sgr_x_by_xplus1[256];
extern const uint8_t dav1d_sgr_x_by_x[256];
extern const int8_t dav1d_mc_subpel_filters[5][15][8];
extern const int8_t dav1d_mc_warp_filter[193][8];
extern const int16_t dav1d_resize_filter[64][8];
extern const uint8_t dav1d_sm_weights[128];
extern const int16_t dav1d_dr_intra_derivative[90];
extern const uint16_t dav1d_dr_intra_derivative[44];
extern const int8_t dav1d_filter_intra_taps[5][64];
extern const uint8_t dav1d_obmc_masks[64];
......
This diff is collapsed.
......@@ -39,6 +39,7 @@ decl_angular_ipred_fn(dav1d_ipred_smooth_avx2);
decl_angular_ipred_fn(dav1d_ipred_smooth_v_avx2);
decl_angular_ipred_fn(dav1d_ipred_smooth_h_avx2);
decl_angular_ipred_fn(dav1d_ipred_z1_avx2);
decl_angular_ipred_fn(dav1d_ipred_z3_avx2);
decl_angular_ipred_fn(dav1d_ipred_filter_avx2);
decl_cfl_pred_fn(dav1d_ipred_cfl_avx2);
......@@ -86,6 +87,7 @@ void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_avx2;
c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_avx2;
c->intra_pred[Z1_PRED] = dav1d_ipred_z1_avx2;
c->intra_pred[Z3_PRED] = dav1d_ipred_z3_avx2;
c->intra_pred[FILTER_PRED] = dav1d_ipred_filter_avx2;
c->cfl_pred[DC_PRED] = dav1d_ipred_cfl_avx2;
......
......@@ -77,10 +77,12 @@ decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_avx2);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_avx2);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_avx2);
decl_itx17_fns(4, 4, ssse3);
decl_itx16_fns(4, 8, ssse3);
decl_itx16_fns(8, 4, ssse3);
decl_itx16_fns(8, 8, ssse3);
decl_itx17_fns( 4, 4, ssse3);
decl_itx16_fns( 4, 8, ssse3);
decl_itx16_fns( 8, 4, ssse3);
decl_itx16_fns( 8, 8, ssse3);
decl_itx16_fns( 4, 16, ssse3);
decl_itx16_fns(16, 4, ssse3);
void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
......@@ -124,10 +126,12 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
assign_itx17_fn(, 4, 4, ssse3);
assign_itx16_fn(R, 4, 8, ssse3);
assign_itx16_fn(R, 8, 4, ssse3);
assign_itx16_fn(, 8, 8, ssse3);
assign_itx17_fn(, 4, 4, ssse3);
assign_itx16_fn(R, 4, 8, ssse3);
assign_itx16_fn(R, 8, 4, ssse3);
assign_itx16_fn(, 8, 8, ssse3);
assign_itx16_fn(R, 4, 16, ssse3);
assign_itx16_fn(R, 16, 4, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
......
This diff is collapsed.
......@@ -42,14 +42,12 @@ pw_2048: times 2 dw 2048
pw_16380: times 2 dw 16380
pw_0_128: dw 0, 128
pw_5_6: dw 5, 6
pw_82: times 2 dw 82
pw_91_5: dw 91, 5
pd_6: dd 6
pd_255: dd 255
pd_1024: dd 1024
pd_0x80000: dd 0x80000
pd_0xf0080029: dd 0xf0080029
pd_0xf00801c7: dd 0xf00801c7
cextern sgr_x_by_xplus1
cextern sgr_x_by_x
SECTION .text
......@@ -477,76 +475,65 @@ cglobal sgr_box3_v, 5, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
RET
INIT_YMM avx2
cglobal sgr_calc_ab1, 4, 6, 14, a, b, w, h, s
cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s
sub aq, (384+16-1)*4
sub bq, (384+16-1)*2
add hd, 2
lea r5, [sgr_x_by_xplus1]
pxor m6, m6
vpbroadcastd m7, [pw_91_5]
lea r5, [sgr_x_by_x-0xf03]
%ifidn sd, sm
movd xm8, sd
vpbroadcastd m8, xm8
movd xm6, sd
vpbroadcastd m6, xm6
%else
vpbroadcastd m8, sm
vpbroadcastd m6, sm
%endif
vpbroadcastd m9, [pd_0x80000]
vpbroadcastd m10, [pd_255]
psrad m12, m9, 8 ; pd_2048
psrad m11, m9, 11 ; pd_256
pcmpeqb m13, m13
vpbroadcastd m8, [pd_0xf00801c7]
vpbroadcastd m9, [pw_256]
pcmpeqb m7, m7
psrld m10, m9, 13 ; pd_2048
DEFINE_ARGS a, b, w, h, x
.loop_y:
mov xq, -2
.loop_x:
movu xm0, [aq+xq*4+ 0]
movu xm1, [aq+xq*4+16]
vinserti128 m0, [aq+xq*4+ 0+(384+16)*4], 1
vinserti128 m1, [aq+xq*4+16+(384+16)*4], 1
movu xm2, [bq+xq*2]
vinserti128 m2, [bq+xq*2+(384+16)*2], 1
pslld m3, m0, 3
pslld m4, m1, 3
paddd m3, m0 ; aa * 9 [first half]
paddd m4, m1 ; aa * 9 [second half]
punpcklwd m0, m6, m2
punpckhwd m2, m6, m2
pmaddwd m1, m0, m0
pmaddwd m5, m2, m2
pmaddwd m0, m7
pmaddwd m2, m7
psubd m3, m1 ; p = aa * 9 - bb * bb [first half]
psubd m4, m5 ; p = aa * 9 - bb * bb [second half]
pmulld m3, m8
pmulld m4, m8
paddd m3, m9
paddd m4, m9
psrld m3, 20 ; z [first half]
psrld m4, 20 ; z [second half]
pminsd m3, m10
pminsd m4, m10
mova m5, m13
vpgatherdd m1, [r5+m3*4], m5 ; xx [first half]
mova m5, m13
vpgatherdd m3, [r5+m4*4], m5 ; xx [second half]
psubd m5, m11, m1
psubd m4, m11, m3
packssdw m1, m3
pmullw m5, m7
pmullw m4, m7
pmaddwd m5, m0
pmaddwd m4, m2
paddd m5, m12
paddd m4, m12
psrad m5, 12
psrad m4, 12
movu [bq+xq*2], xm1
vextracti128 [bq+xq*2+(384+16)*2], m1, 1
movu [aq+xq*4+ 0], xm5
movu [aq+xq*4+16], xm4
vextracti128 [aq+xq*4+ 0+(384+16)*4], m5, 1
vextracti128 [aq+xq*4+16+(384+16)*4], m4, 1
pmovzxwd m0, [bq+xq*2]
pmovzxwd m1, [bq+xq*2+(384+16)*2]
movu m2, [aq+xq*4]
movu m3, [aq+xq*4+(384+16)*4]
pslld m4, m2, 3
pslld m5, m3, 3
paddd m2, m4 ; aa * 9
paddd m3, m5
pmaddwd m4, m0, m0
pmaddwd m5, m1, m1
pmaddwd m0, m8
pmaddwd m1, m8
psubd m2, m4 ; p = aa * 9 - bb * bb
psubd m3, m5
pmulld m2, m6
pmulld m3, m6
paddusw m2, m8
paddusw m3, m8
psrld m2, 20 ; z
psrld m3, 20
mova m5, m7
vpgatherdd m4, [r5+m2], m5 ; xx
mova m5, m7
vpgatherdd m2, [r5+m3], m5
psrld m4, 24
psrld m2, 24
pmulld m0, m4
pmulld m1, m2
packssdw m4, m2
psubw m4, m9, m4
vpermq m4, m4, q3120
paddd m0, m10
paddd m1, m10
psrld m0, 12
psrld m1, 12
movu [bq+xq*2], xm4
vextracti128 [bq+xq*2+(384+16)*2], m4, 1
movu [aq+xq*4], m0
movu [aq+xq*4+(384+16)*4], m1
add xd, 8
cmp xd, wd
jl .loop_x
......@@ -903,78 +890,67 @@ cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
jmp .loop_y_noload
INIT_YMM avx2
cglobal sgr_calc_ab2, 4, 6, 14, a, b, w, h, s
cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s
sub aq, (384+16-1)*4
sub bq, (384+16-1)*2
add hd, 2
lea r5, [sgr_x_by_xplus1]
pxor m6, m6
vpbroadcastd m7, [pw_82]
lea r5, [sgr_x_by_x-0xf03]
%ifidn sd, sm
movd xm8, sd
vpbroadcastd m8, xm8
movd xm6, sd
vpbroadcastd m6, xm6
%else
vpbroadcastd m8, sm
vpbroadcastd m6, sm
%endif
vpbroadcastd m9, [pd_0x80000]
vpbroadcastd m10, [pd_255]
psrad m12, m9, 8 ; pd_2048
psrad m11, m9, 11 ; pd_256
pcmpeqb m13, m13
vpbroadcastd m8, [pd_0xf0080029]
vpbroadcastd m9, [pw_256]
pcmpeqb m7, m7
psrld m10, m9, 15 ; pd_512
DEFINE_ARGS a, b, w, h, x
.loop_y:
mov xq, -2
.loop_x:
movu xm0, [aq+xq*4+ 0]
movu xm1, [aq+xq*4+16]
vinserti128 m0, [aq+xq*4+32], 1
vinserti128 m1, [aq+xq*4+48], 1
movu m2, [bq+xq*2]
pslld m3, m0, 5 ; aa * 32 [first half]
pslld m4, m1, 5 ; aa * 32 [second half]
paddd m3, m0 ; aa * 33 [first half]
paddd m4, m1 ; aa * 33 [second half]
pslld m0, 3 ; aa * 8 [first half]
pslld m1, 3 ; aa * 8 [second half]
psubd m3, m0 ; aa * 25 [first half]
psubd m4, m1 ; aa * 25 [second half]
punpcklwd m0, m2, m6
punpckhwd m2, m6
pmaddwd m1, m0, m0
pmaddwd m5, m2, m2
paddw m0, m0
paddw m2, m2
psubd m3, m1 ; p = aa * 25 - bb * bb [first half]
psubd m4, m5 ; p = aa * 25 - bb * bb [second half]
pmulld m3, m8
pmulld m4, m8
paddd m3, m9
paddd m4, m9
psrld m3, 20 ; z [first half]
psrld m4, 20 ; z [second half]
pminsd m3, m10
pminsd m4, m10
mova m5, m13
vpgatherdd m1, [r5+m3*4], m5 ; xx [first half]
mova m5, m13
vpgatherdd m3, [r5+m4*4], m5 ; xx [second half]
psubd m5, m11, m1
psubd m4, m11, m3
packssdw m1, m3
pmullw m5, m7
pmullw m4, m7
pmaddwd m5, m0
pmaddwd m4, m2
paddd m5, m12
paddd m4, m12
psrad m5, 12
psrad m4, 12
movu [bq+xq*2], m1
movu [aq+xq*4+ 0], xm5
movu [aq+xq*4+16], xm4
vextracti128 [aq+xq*4+32], m5, 1
vextracti128 [aq+xq*4+48], m4, 1
pmovzxwd m0, [bq+xq*2+ 0]
pmovzxwd m1, [bq+xq*2+16]
movu m2, [aq+xq*4+ 0]
movu m3, [aq+xq*4+32]
pslld m4, m2, 3 ; aa * 8
pslld m5, m3, 3
paddd m2, m4 ; aa * 9
paddd m3, m5
paddd m4, m4 ; aa * 16
paddd m5, m5
paddd m2, m4 ; aa * 25
paddd m3, m5
pmaddwd m4, m0, m0
pmaddwd m5, m1, m1
psubd m2, m4 ; p = aa * 25 - bb * bb
psubd m3, m5
pmulld m2, m6
pmulld m3, m6
paddusw m2, m8
paddusw m3, m8
psrld m2, 20 ; z
psrld m3, 20
mova m5, m7
vpgatherdd m4, [r5+m2], m5 ; xx
mova m5, m7
vpgatherdd m2, [r5+m3], m5
psrld m4, 24
psrld m2, 24
packssdw m3, m4, m2
pmullw m4, m8
pmullw m2, m8
psubw m3, m9, m3
vpermq m3, m3, q3120
pmaddwd m0, m4
pmaddwd m1, m2
paddd m0, m10
paddd m1, m10
psrld m0, 10
psrld m1, 10
movu [bq+xq*2], m3
movu [aq+xq*4+ 0], m0
movu [aq+xq*4+32], m1
add xd, 16
cmp xd, wd
jl .loop_x
......
This diff is collapsed.
This diff is collapsed.