...
 
Commits (10)
  • obu: set subsampling to 1 for monochrome · e19c7699
    James Almer authored
  • obu: fix separate_uv_delta_q for RGB · 18d2d750
    James Almer authored
  • Add ipred_z3 AVX2 asm · a440af4a
    Henrik Gramner authored
    Also backport some minor optimizations to z1.
  • Shrink dav1d_dr_intra_derivative[] · f813285c
    Henrik Gramner authored
  • Add minor x86 bilin mc optimizations · f753caea
    Henrik Gramner authored
  • allocate Tile Group cache dynamically · 46435a53
    James Almer authored
  • 33ce3829
    Marvin Scholz authored
  • Add SGR optimizations · 205b723e
    Henrik Gramner authored
  • Add SSSE3 implementation for the 4x16 and 16x4 blocks in itx · bf659082
    Liwei Wang authored
    Cycle times:
    Cycle times:
    inv_txfm_add_4x16_adst_adst_0_8bpc_c: 2203.6
    inv_txfm_add_4x16_adst_adst_0_8bpc_ssse3: 198.7
    inv_txfm_add_4x16_adst_adst_1_8bpc_c: 2235.1
    inv_txfm_add_4x16_adst_adst_1_8bpc_ssse3: 199.7
    inv_txfm_add_4x16_adst_adst_2_8bpc_c: 2199.1
    inv_txfm_add_4x16_adst_adst_2_8bpc_ssse3: 199.9
    inv_txfm_add_4x16_adst_dct_0_8bpc_c: 2272.4
    inv_txfm_add_4x16_adst_dct_0_8bpc_ssse3: 50.0
    inv_txfm_add_4x16_adst_dct_1_8bpc_c: 2281.6
    inv_txfm_add_4x16_adst_dct_1_8bpc_ssse3: 163.7
    inv_txfm_add_4x16_adst_dct_2_8bpc_c: 2262.5
    inv_txfm_add_4x16_adst_dct_2_8bpc_ssse3: 164.7
    inv_txfm_add_4x16_adst_flipadst_0_8bpc_c: 2456.5
    inv_txfm_add_4x16_adst_flipadst_0_8bpc_ssse3: 204.3
    inv_txfm_add_4x16_adst_flipadst_1_8bpc_c: 2349.1
    inv_txfm_add_4x16_adst_flipadst_1_8bpc_ssse3: 198.5
    inv_txfm_add_4x16_adst_flipadst_2_8bpc_c: 2241.5
    inv_txfm_add_4x16_adst_flipadst_2_8bpc_ssse3: 198.7
    inv_txfm_add_4x16_adst_identity_0_8bpc_c: 1574.7
    inv_txfm_add_4x16_adst_identity_0_8bpc_ssse3: 117.0
    inv_txfm_add_4x16_adst_identity_1_8bpc_c: 1576.3
    inv_txfm_add_4x16_adst_identity_1_8bpc_ssse3: 116.6
    inv_txfm_add_4x16_adst_identity_2_8bpc_c: 1572.9
    inv_txfm_add_4x16_adst_identity_2_8bpc_ssse3: 116.7
    inv_txfm_add_4x16_dct_adst_0_8bpc_c: 2162.8
    inv_txfm_add_4x16_dct_adst_0_8bpc_ssse3: 187.6
    inv_txfm_add_4x16_dct_adst_1_8bpc_c: 2180.4
    inv_txfm_add_4x16_dct_adst_1_8bpc_ssse3: 185.6
    inv_txfm_add_4x16_dct_adst_2_8bpc_c: 2165.1
    inv_txfm_add_4x16_dct_adst_2_8bpc_ssse3: 184.9
    inv_txfm_add_4x16_dct_dct_0_8bpc_c: 2233.7
    inv_txfm_add_4x16_dct_dct_0_8bpc_ssse3: 49.5
    inv_txfm_add_4x16_dct_dct_1_8bpc_c: 2770.4
    inv_txfm_add_4x16_dct_dct_1_8bpc_ssse3: 148.4
    inv_txfm_add_4x16_dct_dct_2_8bpc_c: 2288.7
    inv_txfm_add_4x16_dct_dct_2_8bpc_ssse3: 149.0
    inv_txfm_add_4x16_dct_flipadst_0_8bpc_c: 2242.0
    inv_txfm_add_4x16_dct_flipadst_0_8bpc_ssse3: 185.8
    inv_txfm_add_4x16_dct_flipadst_1_8bpc_c: 2249.6
    inv_txfm_add_4x16_dct_flipadst_1_8bpc_ssse3: 188.4
    inv_txfm_add_4x16_dct_flipadst_2_8bpc_c: 2237.3
    inv_txfm_add_4x16_dct_flipadst_2_8bpc_ssse3: 185.1
    inv_txfm_add_4x16_dct_identity_0_8bpc_c: 1532.3
    inv_txfm_add_4x16_dct_identity_0_8bpc_ssse3: 63.7
    inv_txfm_add_4x16_dct_identity_1_8bpc_c: 1534.5
    inv_txfm_add_4x16_dct_identity_1_8bpc_ssse3: 63.6
    inv_txfm_add_4x16_dct_identity_2_8bpc_c: 1548.1
    inv_txfm_add_4x16_dct_identity_2_8bpc_ssse3: 101.6
    inv_txfm_add_4x16_flipadst_adst_0_8bpc_c: 2205.2
    inv_txfm_add_4x16_flipadst_adst_0_8bpc_ssse3: 201.6
    inv_txfm_add_4x16_flipadst_adst_1_8bpc_c: 2222.0
    inv_txfm_add_4x16_flipadst_adst_1_8bpc_ssse3: 202.6
    inv_txfm_add_4x16_flipadst_adst_2_8bpc_c: 2205.2
    inv_txfm_add_4x16_flipadst_adst_2_8bpc_ssse3: 205.7
    inv_txfm_add_4x16_flipadst_dct_0_8bpc_c: 2294.9
    inv_txfm_add_4x16_flipadst_dct_0_8bpc_ssse3: 50.0
    inv_txfm_add_4x16_flipadst_dct_1_8bpc_c: 2304.2
    inv_txfm_add_4x16_flipadst_dct_1_8bpc_ssse3: 164.5
    inv_txfm_add_4x16_flipadst_dct_2_8bpc_c: 2292.7
    inv_txfm_add_4x16_flipadst_dct_2_8bpc_ssse3: 164.5
    inv_txfm_add_4x16_flipadst_flipadst_0_8bpc_c: 2281.3
    inv_txfm_add_4x16_flipadst_flipadst_0_8bpc_ssse3: 202.9
    inv_txfm_add_4x16_flipadst_flipadst_1_8bpc_c: 2258.7
    inv_txfm_add_4x16_flipadst_flipadst_1_8bpc_ssse3: 202.4
    inv_txfm_add_4x16_flipadst_flipadst_2_8bpc_c: 2261.0
    inv_txfm_add_4x16_flipadst_flipadst_2_8bpc_ssse3: 201.3
    inv_txfm_add_4x16_flipadst_identity_0_8bpc_c: 1580.5
    inv_txfm_add_4x16_flipadst_identity_0_8bpc_ssse3: 116.1
    inv_txfm_add_4x16_flipadst_identity_1_8bpc_c: 1578.7
    inv_txfm_add_4x16_flipadst_identity_1_8bpc_ssse3: 116.7
    inv_txfm_add_4x16_flipadst_identity_2_8bpc_c: 1590.8
    inv_txfm_add_4x16_flipadst_identity_2_8bpc_ssse3: 117.4
    inv_txfm_add_4x16_identity_adst_0_8bpc_c: 1949.0
    inv_txfm_add_4x16_identity_adst_0_8bpc_ssse3: 170.9
    inv_txfm_add_4x16_identity_adst_1_8bpc_c: 1947.4
    inv_txfm_add_4x16_identity_adst_1_8bpc_ssse3: 171.0
    inv_txfm_add_4x16_identity_adst_2_8bpc_c: 1948.7
    inv_txfm_add_4x16_identity_adst_2_8bpc_ssse3: 170.3
    inv_txfm_add_4x16_identity_dct_0_8bpc_c: 2022.3
    inv_txfm_add_4x16_identity_dct_0_8bpc_ssse3: 59.2
    inv_txfm_add_4x16_identity_dct_1_8bpc_c: 2020.8
    inv_txfm_add_4x16_identity_dct_1_8bpc_ssse3: 133.7
    inv_txfm_add_4x16_identity_dct_2_8bpc_c: 2020.2
    inv_txfm_add_4x16_identity_dct_2_8bpc_ssse3: 133.2
    inv_txfm_add_4x16_identity_flipadst_0_8bpc_c: 2024.7
    inv_txfm_add_4x16_identity_flipadst_0_8bpc_ssse3: 170.3
    inv_txfm_add_4x16_identity_flipadst_1_8bpc_c: 2021.8
    inv_txfm_add_4x16_identity_flipadst_1_8bpc_ssse3: 170.0
    inv_txfm_add_4x16_identity_flipadst_2_8bpc_c: 2022.5
    inv_txfm_add_4x16_identity_flipadst_2_8bpc_ssse3: 169.9
    inv_txfm_add_4x16_identity_identity_0_8bpc_c: 1328.4
    inv_txfm_add_4x16_identity_identity_0_8bpc_ssse3: 87.7
    inv_txfm_add_4x16_identity_identity_1_8bpc_c: 1330.9
    inv_txfm_add_4x16_identity_identity_1_8bpc_ssse3: 87.7
    inv_txfm_add_4x16_identity_identity_2_8bpc_c: 1327.3
    inv_txfm_add_4x16_identity_identity_2_8bpc_ssse3: 87.6
    inv_txfm_add_16x4_adst_adst_0_8bpc_c: 2166.3
    inv_txfm_add_16x4_adst_adst_0_8bpc_ssse3: 186.3
    inv_txfm_add_16x4_adst_adst_1_8bpc_c: 2166.9
    inv_txfm_add_16x4_adst_adst_1_8bpc_ssse3: 184.9
    inv_txfm_add_16x4_adst_adst_2_8bpc_c: 2167.2
    inv_txfm_add_16x4_adst_adst_2_8bpc_ssse3: 185.2
    inv_txfm_add_16x4_adst_dct_0_8bpc_c: 2123.2
    inv_txfm_add_16x4_adst_dct_0_8bpc_ssse3: 172.1
    inv_txfm_add_16x4_adst_dct_1_8bpc_c: 2124.2
    inv_txfm_add_16x4_adst_dct_1_8bpc_ssse3: 171.2
    inv_txfm_add_16x4_adst_dct_2_8bpc_c: 2122.8
    inv_txfm_add_16x4_adst_dct_2_8bpc_ssse3: 171.8
    inv_txfm_add_16x4_adst_flipadst_0_8bpc_c: 2213.3
    inv_txfm_add_16x4_adst_flipadst_0_8bpc_ssse3: 189.6
    inv_txfm_add_16x4_adst_flipadst_1_8bpc_c: 2227.7
    inv_txfm_add_16x4_adst_flipadst_1_8bpc_ssse3: 188.4
    inv_txfm_add_16x4_adst_flipadst_2_8bpc_c: 2228.5
    inv_txfm_add_16x4_adst_flipadst_2_8bpc_ssse3: 188.4
    inv_txfm_add_16x4_adst_identity_0_8bpc_c: 1906.7
    inv_txfm_add_16x4_adst_identity_0_8bpc_ssse3: 154.3
    inv_txfm_add_16x4_adst_identity_1_8bpc_c: 1905.2
    inv_txfm_add_16x4_adst_identity_1_8bpc_ssse3: 155.6
    inv_txfm_add_16x4_adst_identity_2_8bpc_c: 1905.6
    inv_txfm_add_16x4_adst_identity_2_8bpc_ssse3: 156.3
    inv_txfm_add_16x4_dct_adst_0_8bpc_c: 2209.8
    inv_txfm_add_16x4_dct_adst_0_8bpc_ssse3: 37.4
    inv_txfm_add_16x4_dct_adst_1_8bpc_c: 2209.8
    inv_txfm_add_16x4_dct_adst_1_8bpc_ssse3: 157.9
    inv_txfm_add_16x4_dct_adst_2_8bpc_c: 2221.1
    inv_txfm_add_16x4_dct_adst_2_8bpc_ssse3: 158.5
    inv_txfm_add_16x4_dct_dct_0_8bpc_c: 2177.5
    inv_txfm_add_16x4_dct_dct_0_8bpc_ssse3: 29.6
    inv_txfm_add_16x4_dct_dct_1_8bpc_c: 2179.3
    inv_txfm_add_16x4_dct_dct_1_8bpc_ssse3: 144.9
    inv_txfm_add_16x4_dct_dct_2_8bpc_c: 2177.8
    inv_txfm_add_16x4_dct_dct_2_8bpc_ssse3: 143.7
    inv_txfm_add_16x4_dct_flipadst_0_8bpc_c: 2293.6
    inv_txfm_add_16x4_dct_flipadst_0_8bpc_ssse3: 38.3
    inv_txfm_add_16x4_dct_flipadst_1_8bpc_c: 2293.2
    inv_txfm_add_16x4_dct_flipadst_1_8bpc_ssse3: 163.9
    inv_txfm_add_16x4_dct_flipadst_2_8bpc_c: 2301.3
    inv_txfm_add_16x4_dct_flipadst_2_8bpc_ssse3: 163.7
    inv_txfm_add_16x4_dct_identity_0_8bpc_c: 1977.7
    inv_txfm_add_16x4_dct_identity_0_8bpc_ssse3: 39.9
    inv_txfm_add_16x4_dct_identity_1_8bpc_c: 1978.7
    inv_txfm_add_16x4_dct_identity_1_8bpc_ssse3: 126.8
    inv_txfm_add_16x4_dct_identity_2_8bpc_c: 1979.5
    inv_txfm_add_16x4_dct_identity_2_8bpc_ssse3: 128.1
    inv_txfm_add_16x4_flipadst_adst_0_8bpc_c: 2175.6
    inv_txfm_add_16x4_flipadst_adst_0_8bpc_ssse3: 185.1
    inv_txfm_add_16x4_flipadst_adst_1_8bpc_c: 2175.7
    inv_txfm_add_16x4_flipadst_adst_1_8bpc_ssse3: 185.7
    inv_txfm_add_16x4_flipadst_adst_2_8bpc_c: 2173.1
    inv_txfm_add_16x4_flipadst_adst_2_8bpc_ssse3: 185.0
    inv_txfm_add_16x4_flipadst_dct_0_8bpc_c: 2140.5
    inv_txfm_add_16x4_flipadst_dct_0_8bpc_ssse3: 172.0
    inv_txfm_add_16x4_flipadst_dct_1_8bpc_c: 2147.5
    inv_txfm_add_16x4_flipadst_dct_1_8bpc_ssse3: 171.9
    inv_txfm_add_16x4_flipadst_dct_2_8bpc_c: 2148.5
    inv_txfm_add_16x4_flipadst_dct_2_8bpc_ssse3: 172.0
    inv_txfm_add_16x4_flipadst_flipadst_0_8bpc_c: 2240.6
    inv_txfm_add_16x4_flipadst_flipadst_0_8bpc_ssse3: 191.3
    inv_txfm_add_16x4_flipadst_flipadst_1_8bpc_c: 2243.5
    inv_txfm_add_16x4_flipadst_flipadst_1_8bpc_ssse3: 193.2
    inv_txfm_add_16x4_flipadst_flipadst_2_8bpc_c: 2242.9
    inv_txfm_add_16x4_flipadst_flipadst_2_8bpc_ssse3: 192.0
    inv_txfm_add_16x4_flipadst_identity_0_8bpc_c: 1919.2
    inv_txfm_add_16x4_flipadst_identity_0_8bpc_ssse3: 155.1
    inv_txfm_add_16x4_flipadst_identity_1_8bpc_c: 1925.2
    inv_txfm_add_16x4_flipadst_identity_1_8bpc_ssse3: 155.2
    inv_txfm_add_16x4_flipadst_identity_2_8bpc_c: 2084.8
    inv_txfm_add_16x4_flipadst_identity_2_8bpc_ssse3: 155.0
    inv_txfm_add_16x4_identity_adst_0_8bpc_c: 1498.5
    inv_txfm_add_16x4_identity_adst_0_8bpc_ssse3: 107.6
    inv_txfm_add_16x4_identity_adst_1_8bpc_c: 1499.5
    inv_txfm_add_16x4_identity_adst_1_8bpc_ssse3: 107.0
    inv_txfm_add_16x4_identity_adst_2_8bpc_c: 1498.9
    inv_txfm_add_16x4_identity_adst_2_8bpc_ssse3: 107.9
    inv_txfm_add_16x4_identity_dct_0_8bpc_c: 1471.9
    inv_txfm_add_16x4_identity_dct_0_8bpc_ssse3: 45.4
    inv_txfm_add_16x4_identity_dct_1_8bpc_c: 1476.4
    inv_txfm_add_16x4_identity_dct_1_8bpc_ssse3: 45.5
    inv_txfm_add_16x4_identity_dct_2_8bpc_c: 1459.8
    inv_txfm_add_16x4_identity_dct_2_8bpc_ssse3: 92.3
    inv_txfm_add_16x4_identity_flipadst_0_8bpc_c: 1548.7
    inv_txfm_add_16x4_identity_flipadst_0_8bpc_ssse3: 112.1
    inv_txfm_add_16x4_identity_flipadst_1_8bpc_c: 1548.2
    inv_txfm_add_16x4_identity_flipadst_1_8bpc_ssse3: 111.7
    inv_txfm_add_16x4_identity_flipadst_2_8bpc_c: 1547.2
    inv_txfm_add_16x4_identity_flipadst_2_8bpc_ssse3: 114.1
    inv_txfm_add_16x4_identity_identity_0_8bpc_c: 1271.5
    inv_txfm_add_16x4_identity_identity_0_8bpc_ssse3: 74.5
    inv_txfm_add_16x4_identity_identity_1_8bpc_c: 1266.8
    inv_txfm_add_16x4_identity_identity_1_8bpc_ssse3: 74.5
    inv_txfm_add_16x4_identity_identity_2_8bpc_c: 1268.0
    inv_txfm_add_16x4_identity_identity_2_8bpc_ssse3: 74.6
  • Don't filter top/left intra edge if intra_edge_filter=0 · 0a8df458
    Ronald S. Bultje authored
    Fixes #236.
......@@ -70,7 +70,7 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr
# Compile
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86* targets, [nasm](https://nasm.us/) (2.13 or higher)
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86* targets, [nasm](https://nasm.us/) (2.13.02 or higher)
2. Run `meson build --buildtype release`
3. Build with `ninja -C build`
......
......@@ -311,8 +311,8 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
nasm_r = run_command(nasm, '-v')
out = nasm_r.stdout().strip().split()
if out[1].to_lower() == 'version'
if out[2].version_compare('<2.13')
error('nasm 2.13 or later is required, found nasm @0@'.format(out[2]))
if out[2].version_compare('<2.13.02')
error('nasm 2.13.02 or later is required, found nasm @0@'.format(out[2]))
endif
else
error('unexpected nasm version string: @0@'.format(nasm_r.stdout()))
......
......@@ -3172,6 +3172,12 @@ int dav1d_submit_frame(Dav1dContext *const c) {
}
// FIXME qsort so tiles are in order (for frame threading)
if (f->n_tile_data_alloc < c->n_tile_data) {
struct Dav1dTileGroup *tile = realloc(f->tile, c->n_tile_data * sizeof(*f->tile));
if (!tile) goto error;
f->tile = tile;
f->n_tile_data_alloc = c->n_tile_data;
}
memcpy(f->tile, c->tile, c->n_tile_data * sizeof(*f->tile));
memset(c->tile, 0, c->n_tile_data * sizeof(*c->tile));
f->n_tile_data = c->n_tile_data;
......
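
The hunk above (and the matching OBU-parsing hunk in obu.c further down) replaces the fixed tile-group array with a heap buffer that grows on demand. A minimal, self-contained sketch of that grow-on-demand pattern follows; the names are illustrative stand-ins, not the dav1d API:

```c
/* Grow-on-demand sketch: a separate "allocated" counter tracks capacity and
 * realloc() is only called when the requested count exceeds it. On realloc
 * failure the old buffer stays valid, mirroring the `goto error` path above. */
#include <stdlib.h>
#include <string.h>

struct TileGroup { void *data; int start, end; };   /* illustrative stand-in */

struct TileCache {
    struct TileGroup *tile;
    int n_tile_data_alloc;   /* slots currently allocated */
    int n_tile_data;         /* slots currently in use */
};

static int tile_cache_reserve(struct TileCache *const c, const int needed) {
    if (c->n_tile_data_alloc >= needed) return 0;
    struct TileGroup *const tile = realloc(c->tile, needed * sizeof(*tile));
    if (!tile) return -1;                            /* caller handles the error */
    memset(tile + c->n_tile_data_alloc, 0,
           (needed - c->n_tile_data_alloc) * sizeof(*tile));
    c->tile = tile;
    c->n_tile_data_alloc = needed;
    return 0;
}
```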
......@@ -65,16 +65,19 @@ typedef struct Dav1dDSPContext {
Dav1dLoopRestorationDSPContext lr;
} Dav1dDSPContext;
struct Dav1dTileGroup {
Dav1dData data;
int start, end;
};
struct Dav1dContext {
Dav1dFrameContext *fc;
unsigned n_fc;
// cache of OBUs that make up a single frame before we submit them
// to a frame worker to be decoded
struct {
Dav1dData data;
int start, end;
} tile[256];
struct Dav1dTileGroup *tile;
int n_tile_data_alloc;
int n_tile_data;
int n_tiles;
Dav1dRef *seq_hdr_ref;
......@@ -139,10 +142,8 @@ struct Dav1dFrameContext {
unsigned refpoc[7], refrefpoc[7][7];
uint8_t gmv_warp_allowed[7];
CdfThreadContext in_cdf, out_cdf;
struct {
Dav1dData data;
int start, end;
} tile[256];
struct Dav1dTileGroup *tile;
int n_tile_data_alloc;
int n_tile_data;
// for scalable references
......
......@@ -81,8 +81,8 @@ enum IntraPredMode
const pixel *dst, ptrdiff_t stride,
const pixel *prefilter_toplevel_sb_edge,
enum IntraPredMode mode, int *angle,
int tw, int th, pixel *topleft_out
HIGHBD_DECL_SUFFIX);
int tw, int th, int filter_edge,
pixel *topleft_out HIGHBD_DECL_SUFFIX);
// These flags are OR'd with the angle argument into intra predictors.
// ANGLE_USE_EDGE_FILTER_FLAG signals that edges should be convolved
......
......@@ -82,7 +82,7 @@ bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
const ptrdiff_t stride,
const pixel *prefilter_toplevel_sb_edge,
enum IntraPredMode mode, int *const angle,
const int tw, const int th,
const int tw, const int th, const int filter_edge,
pixel *const topleft_out HIGHBD_DECL_SUFFIX)
{
const int bitdepth = bitdepth_from_max(bitdepth_max);
......@@ -201,7 +201,7 @@ bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
} else {
*topleft_out = have_top ? *dst_top : (1 << bitdepth) >> 1;
}
if (mode == Z2_PRED && tw + th >= 6)
if (mode == Z2_PRED && tw + th >= 6 && filter_edge)
*topleft_out = (topleft_out[-1] * 5 + topleft_out[0] * 6 +
topleft_out[1] * 5 + 8) >> 4;
}
......
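
For reference, the corner smoothing this hunk gates is a 3-tap (5, 6, 5)/16 filter on the top-left edge sample. A hedged 8-bit sketch of the gated operation (the real code is bitdepth-templated and additionally requires mode == Z2_PRED; the pointer must sit inside a prepared edge buffer so topleft[-1] and topleft[1] are valid):

```c
#include <stdint.h>

/* Sketch: smooth the top-left corner sample only when the sequence header
 * enables intra edge filtering and the block is large enough (tw + th >= 6). */
static uint8_t filter_topleft(const uint8_t *const topleft,
                              const int tw, const int th, const int filter_edge)
{
    if (!filter_edge || tw + th < 6)
        return topleft[0];
    return (topleft[-1] * 5 + topleft[0] * 6 + topleft[1] * 5 + 8) >> 4;
}
```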
......@@ -422,7 +422,7 @@ static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
const int enable_intra_edge_filter = angle >> 10;
angle &= 511;
assert(angle < 90);
int dx = dav1d_dr_intra_derivative[angle];
int dx = dav1d_dr_intra_derivative[angle >> 1];
pixel top_out[(64 + 64) * 2];
const pixel *top;
int max_base_x;
......@@ -476,8 +476,8 @@ static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
const int enable_intra_edge_filter = angle >> 10;
angle &= 511;
assert(angle > 90 && angle < 180);
int dy = dav1d_dr_intra_derivative[angle - 90];
int dx = dav1d_dr_intra_derivative[180 - angle];
int dy = dav1d_dr_intra_derivative[(angle - 90) >> 1];
int dx = dav1d_dr_intra_derivative[(180 - angle) >> 1];
const int upsample_left = enable_intra_edge_filter ?
get_upsample(width + height, 180 - angle, is_sm) : 0;
const int upsample_above = enable_intra_edge_filter ?
......@@ -557,7 +557,7 @@ static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
const int enable_intra_edge_filter = angle >> 10;
angle &= 511;
assert(angle > 180);
int dy = dav1d_dr_intra_derivative[270 - angle];
int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1];
pixel left_out[(64 + 64) * 2];
const pixel *left;
int max_base_y;
......
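
The three hunks above only change how dav1d_dr_intra_derivative[] is indexed: the table (see the tables.c hunk further down) now has 44 entries addressed by the halved angle, with unused slots left at 0. A small sketch of the per-zone lookups, assuming the declaration from the tables.h hunk below:

```c
#include <stdint.h>

/* Declared in tables.h after this change; per the table's own comment,
 * slots that hold 0 are never reached by valid prediction angles. */
extern const uint16_t dav1d_dr_intra_derivative[44];

static int z1_dx(const int angle) {                 /*   0 < angle <  90 */
    return dav1d_dr_intra_derivative[angle >> 1];
}
static int z2_dy(const int angle) {                 /*  90 < angle < 180 */
    return dav1d_dr_intra_derivative[(angle - 90) >> 1];
}
static int z2_dx(const int angle) {
    return dav1d_dr_intra_derivative[(180 - angle) >> 1];
}
static int z3_dy(const int angle) {                 /* 180 < angle < 270 */
    return dav1d_dr_intra_derivative[(270 - angle) >> 1];
}
```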
......@@ -473,6 +473,7 @@ void dav1d_close(Dav1dContext **const c_out) {
dav1d_free_aligned(f->tc);
dav1d_free_aligned(f->ipred_edge[0]);
free(f->a);
free(f->tile);
free(f->lf.mask);
free(f->lf.lr_mask);
free(f->lf.level);
......@@ -491,6 +492,7 @@ void dav1d_close(Dav1dContext **const c_out) {
}
for (int n = 0; n < c->n_tile_data; n++)
dav1d_data_unref_internal(&c->tile[n].data);
free(c->tile);
for (int n = 0; n < 8; n++) {
dav1d_cdf_thread_unref(&c->cdf[n]);
if (c->refs[n].p.p.data[0])
......
......@@ -446,11 +446,11 @@ static void selfguided_filter(coef *dst, const pixel *src,
const unsigned p = imax(a * n - b * b, 0);
const unsigned z = (p * s + (1 << 19)) >> 20;
const unsigned x = dav1d_sgr_x_by_x[imin(z, 255)];
const int x = dav1d_sgr_x_by_xplus1[imin(z, 255)];
// This is where we invert A and B, so that B is of size coef.
AA[i] = (((1U << 8) - x) * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
BB[i] = x;
AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
BB[i] = 256 - x;
}
AA += step * REST_UNIT_STRIDE;
BB += step * REST_UNIT_STRIDE;
......
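
The two A/B formulations in this hunk are interchangeable because each dav1d_sgr_x_by_x[] entry is 256 minus the corresponding dav1d_sgr_x_by_xplus1[] entry (compare the two tables in the tables.c hunk further down). A small standalone check of that equivalence; sgr_one_by_x is a placeholder value here, not taken from dav1d:

```c
#include <assert.h>

int main(void) {
    const int sgr_one_by_x = 455;            /* placeholder scale factor */
    for (int x = 0; x <= 256; x++) {         /* entry from one table */
        const int xc = 256 - x;              /* complementary entry from the other */
        for (int b = 0; b < 4096; b += 97) { /* arbitrary box-sum values */
            /* formulation 1: multiply by (256 - x), keep x in B */
            const int aa1 = (((1U << 8) - x) * b * sgr_one_by_x + (1 << 11)) >> 12;
            const int bb1 = x;
            /* formulation 2: multiply by the complement, keep 256 - xc in B */
            const int aa2 = (xc * b * sgr_one_by_x + (1 << 11)) >> 12;
            const int bb2 = 256 - xc;
            assert(aa1 == aa2 && bb1 == bb2);
        }
    }
    return 0;
}
```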
......@@ -221,7 +221,7 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
if (hdr->monochrome) {
hdr->color_range = dav1d_get_bits(gb, 1);
hdr->layout = DAV1D_PIXEL_LAYOUT_I400;
hdr->ss_hor = hdr->ss_ver = 0;
hdr->ss_hor = hdr->ss_ver = 1;
hdr->chr = DAV1D_CHR_UNKNOWN;
hdr->separate_uv_delta_q = 0;
} else if (hdr->pri == DAV1D_COLOR_PRI_BT709 &&
......@@ -258,8 +258,8 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
}
hdr->chr = hdr->ss_hor == 1 && hdr->ss_ver == 1 ?
dav1d_get_bits(gb, 2) : DAV1D_CHR_UNKNOWN;
hdr->separate_uv_delta_q = dav1d_get_bits(gb, 1);
}
hdr->separate_uv_delta_q = !hdr->monochrome && dav1d_get_bits(gb, 1);
#if DEBUG_SEQ_HDR
printf("SEQHDR: post-colorinfo: off=%ld\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
......@@ -1311,7 +1311,14 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
case OBU_TILE_GRP: {
if (global) break;
if (!c->frame_hdr) goto error;
if (c->n_tile_data >= 256) goto error;
if (c->n_tile_data_alloc < c->n_tile_data + 1) {
if ((c->n_tile_data + 1) > INT_MAX / (int)sizeof(*c->tile)) goto error;
struct Dav1dTileGroup *tile = realloc(c->tile, (c->n_tile_data + 1) * sizeof(*c->tile));
if (!tile) goto error;
c->tile = tile;
memset(c->tile + c->n_tile_data, 0, sizeof(*c->tile));
c->n_tile_data_alloc = c->n_tile_data + 1;
}
parse_tile_hdr(c, &gb);
// Align to the next byte boundary and check for overrun.
dav1d_bytealign_get_bits(&gb);
......
......@@ -833,8 +833,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
edge_flags, dst,
f->cur.stride[0], top_sb_edge,
b->y_mode, &angle,
t_dim->w, t_dim->h, edge
HIGHBD_CALL_SUFFIX);
t_dim->w, t_dim->h,
f->seq_hdr->intra_edge_filter,
edge HIGHBD_CALL_SUFFIX);
dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
t_dim->w * 4, t_dim->h * 4,
angle | intra_flags,
......@@ -951,9 +952,8 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
ts->tiling.row_end >> ss_ver,
0, uv_dst[pl], stride,
top_sb_edge, DC_PRED, &angle,
uv_t_dim->w,
uv_t_dim->h, edge
HIGHBD_CALL_SUFFIX);
uv_t_dim->w, uv_t_dim->h, 0,
edge HIGHBD_CALL_SUFFIX);
dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
uv_t_dim->w * 4,
uv_t_dim->h * 4,
......@@ -1053,8 +1053,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
edge_flags, dst, stride,
top_sb_edge, uv_mode,
&angle, uv_t_dim->w,
uv_t_dim->h, edge
HIGHBD_CALL_SUFFIX);
uv_t_dim->h,
f->seq_hdr->intra_edge_filter,
edge HIGHBD_CALL_SUFFIX);
angle |= intra_edge_filter_flag;
dsp->ipred.intra_pred[m](dst, stride, edge,
uv_t_dim->w * 4,
......@@ -1216,7 +1217,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
t->by, t->by > ts->tiling.row_start,
ts->tiling.col_end, ts->tiling.row_end,
0, dst, f->cur.stride[0], top_sb_edge,
m, &angle, bw4, bh4, tl_edge
m, &angle, bw4, bh4, 0, tl_edge
HIGHBD_CALL_SUFFIX);
dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
......@@ -1358,7 +1359,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
ts->tiling.row_end >> ss_ver,
0, uvdst, f->cur.stride[1],
top_sb_edge, m,
&angle, cbw4, cbh4, tl_edge
&angle, cbw4, cbh4, 0, tl_edge
HIGHBD_CALL_SUFFIX);
dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0
......
......@@ -502,25 +502,25 @@ const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1
{ 2, 0, 22, -1 },
};
const int dav1d_sgr_x_by_xplus1[256] = {
1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
256,
const uint8_t dav1d_sgr_x_by_x[256] = {
255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17,
16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9,
8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,
6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0
};
const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = {
......@@ -775,37 +775,36 @@ const uint8_t dav1d_sm_weights[128] = {
7, 6, 6, 5, 5, 4, 4, 4
};
const int16_t dav1d_dr_intra_derivative[90] = {
// More evenly spread out angles and limited to 10-bit
const uint16_t dav1d_dr_intra_derivative[44] = {
// Values that are 0 will never be used
0, 0, 0, // Approx angle
1023, 0, 0, // 3, ...
547, 0, 0, // 6, ...
372, 0, 0, 0, 0, // 9, ...
273, 0, 0, // 14, ...
215, 0, 0, // 17, ...
178, 0, 0, // 20, ...
151, 0, 0, // 23, ... (113 & 203 are base angles)
132, 0, 0, // 26, ...
116, 0, 0, // 29, ...
102, 0, 0, 0, // 32, ...
90, 0, 0, // 36, ...
80, 0, 0, // 39, ...
71, 0, 0, // 42, ...
64, 0, 0, // 45, ... (45 & 135 are base angles)
57, 0, 0, // 48, ...
51, 0, 0, // 51, ...
45, 0, 0, 0, // 54, ...
40, 0, 0, // 58, ...
35, 0, 0, // 61, ...
31, 0, 0, // 64, ...
27, 0, 0, // 67, ... (67 & 157 are base angles)
23, 0, 0, // 70, ...
19, 0, 0, // 73, ...
15, 0, 0, 0, 0, // 76, ...
11, 0, 0, // 81, ...
7, 0, 0, // 84, ...
3, 0, 0, // 87, ...
0, // Angles:
1023, 0, // 3, 93, 183
547, // 6, 96, 186
372, 0, 0, // 9, 99, 189
273, // 14, 104, 194
215, 0, // 17, 107, 197
178, // 20, 110, 200
151, 0, // 23, 113, 203 (113 & 203 are base angles)
132, // 26, 116, 206
116, 0, // 29, 119, 209
102, 0, // 32, 122, 212
90, // 36, 126, 216
80, 0, // 39, 129, 219
71, // 42, 132, 222
64, 0, // 45, 135, 225 (45 & 135 are base angles)
57, // 48, 138, 228
51, 0, // 51, 141, 231
45, 0, // 54, 144, 234
40, // 58, 148, 238
35, 0, // 61, 151, 241
31, // 64, 154, 244
27, 0, // 67, 157, 247 (67 & 157 are base angles)
23, // 70, 160, 250
19, 0, // 73, 163, 253
15, 0, // 76, 166, 256
11, 0, // 81, 171, 261
7, // 84, 174, 264
3 // 87, 177, 267
};
const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = {
......
......@@ -107,14 +107,14 @@ static const unsigned interintra_allowed_mask =
extern const Dav1dWarpedMotionParams dav1d_default_wm_params;
extern const int16_t dav1d_sgr_params[16][4];
extern const int dav1d_sgr_x_by_xplus1[256];
extern const uint8_t dav1d_sgr_x_by_x[256];
extern const int8_t dav1d_mc_subpel_filters[5][15][8];
extern const int8_t dav1d_mc_warp_filter[193][8];
extern const int16_t dav1d_resize_filter[64][8];
extern const uint8_t dav1d_sm_weights[128];
extern const int16_t dav1d_dr_intra_derivative[90];
extern const uint16_t dav1d_dr_intra_derivative[44];
extern const int8_t dav1d_filter_intra_taps[5][64];
extern const uint8_t dav1d_obmc_masks[64];
......
This diff is collapsed.
......@@ -39,6 +39,7 @@ decl_angular_ipred_fn(dav1d_ipred_smooth_avx2);
decl_angular_ipred_fn(dav1d_ipred_smooth_v_avx2);
decl_angular_ipred_fn(dav1d_ipred_smooth_h_avx2);
decl_angular_ipred_fn(dav1d_ipred_z1_avx2);
decl_angular_ipred_fn(dav1d_ipred_z3_avx2);
decl_angular_ipred_fn(dav1d_ipred_filter_avx2);
decl_cfl_pred_fn(dav1d_ipred_cfl_avx2);
......@@ -86,6 +87,7 @@ void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_avx2;
c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_avx2;
c->intra_pred[Z1_PRED] = dav1d_ipred_z1_avx2;
c->intra_pred[Z3_PRED] = dav1d_ipred_z3_avx2;
c->intra_pred[FILTER_PRED] = dav1d_ipred_filter_avx2;
c->cfl_pred[DC_PRED] = dav1d_ipred_cfl_avx2;
......
......@@ -77,10 +77,12 @@ decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_avx2);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_avx2);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_avx2);
decl_itx17_fns(4, 4, ssse3);
decl_itx16_fns(4, 8, ssse3);
decl_itx16_fns(8, 4, ssse3);
decl_itx16_fns(8, 8, ssse3);
decl_itx17_fns( 4, 4, ssse3);
decl_itx16_fns( 4, 8, ssse3);
decl_itx16_fns( 8, 4, ssse3);
decl_itx16_fns( 8, 8, ssse3);
decl_itx16_fns( 4, 16, ssse3);
decl_itx16_fns(16, 4, ssse3);
void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
......@@ -124,10 +126,12 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
assign_itx17_fn(, 4, 4, ssse3);
assign_itx16_fn(R, 4, 8, ssse3);
assign_itx16_fn(R, 8, 4, ssse3);
assign_itx16_fn(, 8, 8, ssse3);
assign_itx17_fn(, 4, 4, ssse3);
assign_itx16_fn(R, 4, 8, ssse3);
assign_itx16_fn(R, 8, 4, ssse3);
assign_itx16_fn(, 8, 8, ssse3);
assign_itx16_fn(R, 4, 16, ssse3);
assign_itx16_fn(R, 16, 4, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
......
......@@ -54,9 +54,20 @@ COEF_PAIR 1931, 3612
COEF_PAIR 3166, 2598
COEF_PAIR 3920, 1189
COEF_PAIR 3784, 1567
COEF_PAIR 995, 3973
COEF_PAIR 1751, 3703
COEF_PAIR 3513, 2106
COEF_PAIR 3857, 1380
COEF_PAIR 4017, 799
COEF_PAIR 201, 4091
COEF_PAIR 2440, 3290
COEF_PAIR 3035, 2751
COEF_PAIR 4052, 601
COEF_PAIR 2276, 3406
pd_2048: times 4 dd 2048
pw_2048: times 8 dw 2048
pw_m2048: times 8 dw -2048
pw_4096: times 8 dw 4096
pw_16384: times 8 dw 16384
pw_m16384: times 8 dw -16384
......@@ -112,18 +123,18 @@ SECTION .text
punpcklbw m%3, m%5 ;extend byte to word
punpcklbw m%4, m%5 ;extend byte to word
paddw m%1, m%3 ;high: dst1 + out1 ;low: dst0 + out0
paddw m%2, m%4 ;high: dst3 + out3 ;low: dst2 + out2
paddw m%3, m%1 ;high: dst1 + out1 ;low: dst0 + out0
paddw m%4, m%2 ;high: dst3 + out3 ;low: dst2 + out2
packuswb m%1, m%2 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
packuswb m%3, m%4 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
movd [%%row_adr1], m%1 ;store dst0 + out0
pshuflw m%2, m%1, q1032
movd [%%row_adr2], m%2 ;store dst1 + out1
punpckhqdq m%1, m%1
movd [%%row_adr3], m%1 ;store dst2 + out2
psrlq m%1, 32
movd [%%row_adr4], m%1 ;store dst3 + out3
movd [%%row_adr1], m%3 ;store dst0 + out0
pshuflw m%4, m%3, q1032
movd [%%row_adr2], m%4 ;store dst1 + out1
punpckhqdq m%3, m%3
movd [%%row_adr3], m%3 ;store dst2 + out2
psrlq m%3, 32
movd [%%row_adr4], m%3 ;store dst3 + out3
%endmacro
%macro ITX4_END 4-5 2048 ; row[1-4], rnd
......@@ -709,9 +720,9 @@ cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
pmulhrsw m2, m3, [coeffq+16*2]
pmulhrsw m3, [coeffq+16*3]
.pass1:
call m(idct_8x4_internal).main
call m(iadst_4x8_internal).inversion
jmp tx2q
jmp m(iadst_4x8_internal).pass1_end
.pass2:
call .main
......@@ -738,8 +749,11 @@ cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
pmulhrsw m2, m3, [coeffq+16*2]
pmulhrsw m3, [coeffq+16*3]
.pass1:
call m(iadst_8x4_internal).main
call .inversion
.pass1_end:
INV_4X8
jmp tx2q
.pass2:
......@@ -775,11 +789,6 @@ ALIGN function_align
IADST8_1D_PACKED
ret
ALIGN function_align
.inversion:
INV_4X8
ret
INV_TXFM_4X8_FN flipadst, dct, 0
INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst
......@@ -792,6 +801,7 @@ cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
pmulhrsw m2, m3, [coeffq+16*2]
pmulhrsw m3, [coeffq+16*3]
.pass1:
call m(iadst_8x4_internal).main
punpcklwd m4, m3, m2
......@@ -832,6 +842,7 @@ cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
pmulhrsw m2, m3, [coeffq+16*2]
pmulhrsw m3, [coeffq+16*3]
.pass1:
mova m5, [o(pw_5793x4)]
paddw m0, m0
paddw m1, m1
......@@ -842,8 +853,7 @@ cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
pmulhrsw m2, m5
pmulhrsw m3, m5
call m(iadst_4x8_internal).inversion
jmp tx2q
jmp m(iadst_4x8_internal).pass1_end
.pass2:
mova m4, [o(pw_4096)]
......@@ -1476,3 +1486,746 @@ ALIGN function_align
mova [coeffq+16*6], m6
mova [coeffq+16*7], m7
jmp m(idct_8x8_internal).end3
%macro INV_TXFM_4X16_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 4x16, 8
%if %3 >= 0
%ifidn %1_%2, dct_identity
mova m0, [o(pw_2896x8)]
mova m1, m0
pmulhrsw m0, [coeffq+16*0]
pmulhrsw m1, [coeffq+16*1]
mova m2, [o(pw_16384)]
mova m3, [o(pw_5793x4)]
mova m4, [o(pw_2048)]
pmulhrsw m0, m2
pmulhrsw m1, m2
psllw m0, 2
psllw m1, 2
pmulhrsw m0, m3
pmulhrsw m1, m3
pmulhrsw m0, m4
pmulhrsw m4, m1
punpckhwd m2, m0, m0
punpcklwd m0, m0
punpckhwd m6, m4, m4
punpcklwd m4, m4
punpckhdq m1, m0, m0
punpckldq m0, m0
punpckhdq m3, m2, m2
punpckldq m2, m2
punpckhdq m5, m4, m4
punpckldq m4, m4
punpckhdq m7, m6, m6
punpckldq m6, m6
mova [coeffq+16*4], m4
TAIL_CALL m(iadst_4x16_internal).end2
%elifidn %1_%2, identity_dct
movd m0, [coeffq+32*0]
punpcklwd m0, [coeffq+32*1]
movd m1, [coeffq+32*2]
punpcklwd m1, [coeffq+32*3]
mova m2, [o(pw_5793x4)]
mova m3, [o(pw_16384)]
mova m4, [o(pw_2896x8)]
punpckldq m0, m1
paddw m0, m0
pmulhrsw m0, m2
pmulhrsw m0, m3
psrlw m3, 3 ; pw_2048
pmulhrsw m0, m4
pmulhrsw m0, m3
punpcklqdq m0, m0
pxor m7, m7
REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3
%elifidn %1_%2, dct_dct
pshuflw m0, [coeffq], q0000
punpcklwd m0, m0
mova m1, [o(pw_2896x8)]
pmulhrsw m0, m1
mov [coeffq], eobd
pmulhrsw m0, [o(pw_16384)]
pmulhrsw m0, m1
pmulhrsw m0, [o(pw_2048)]
%else ; adst_dct / flipadst_dct
pshuflw m0, [coeffq], q0000
punpcklwd m0, m0
%ifidn %1, adst
pmulhrsw m0, [o(iadst4_dconly1a)]
%else ; flipadst
pmulhrsw m0, [o(iadst4_dconly1b)]
%endif
mova m1, [o(pw_16384)]
mov [coeffq], eobd
pmulhrsw m0, m1
psrlw m1, 3 ; pw_2048
pmulhrsw m0, [o(pw_2896x8)]
pmulhrsw m0, m1
%endif
.end:
WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
lea dstq, [dstq+strideq*4]
WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
lea dstq, [dstq+strideq*4]
WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
lea dstq, [dstq+strideq*4]
WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
RET
%endif
%endmacro
INV_TXFM_4X16_FN dct, dct, 0
INV_TXFM_4X16_FN dct, identity, 15
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst
cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
lea r3, [o(m(idct_4x8_internal).pass1)]
.pass1:
mova m0, [coeffq+16*1]
mova m1, [coeffq+16*3]
mova m2, [coeffq+16*5]
mova m3, [coeffq+16*7]
push tx2q
lea tx2q, [o(m(idct_4x16_internal).pass1_2)]
jmp r3
.pass1_2:
mova [coeffq+16*1], m0
mova [coeffq+16*3], m1
mova [coeffq+16*5], m2
mova [coeffq+16*7], m3
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*2]
mova m2, [coeffq+16*4]
mova m3, [coeffq+16*6]
lea tx2q, [o(m(idct_4x16_internal).pass1_end)]
jmp r3
.pass1_end:
pop tx2q
mova m4, [coeffq+16*1]
mova m5, [coeffq+16*3]
mova m6, [coeffq+16*5]
mova m7, [o(pw_16384)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
pmulhrsw m7, [coeffq+16*7]
mova [coeffq+16*7], m7
jmp tx2q
.pass2:
call m(idct_16x4_internal).main
.end:
mova m7, [o(pw_2048)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
pmulhrsw m7, [coeffq+16*7]
mova [coeffq+16*4], m4
.end1:
mova [coeffq+16*5], m5
mova [coeffq+16*6], m6
mov r3, coeffq
WRITE_4X8 0, 1, 3, 2
mova m0, [r3+16*4]
mova m1, [r3+16*5]
mova m2, [r3+16*6]
mova m3, m7
lea dstq, [dstq+strideq*4]
WRITE_4X8 0, 1, 3, 2
.end2:
pxor m7, m7
REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
ret
INV_TXFM_4X16_FN adst, dct, 0
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity
cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
lea r3, [o(m(iadst_4x8_internal).pass1)]
jmp m(idct_4x16_internal).pass1
.pass2:
call m(iadst_16x4_internal).main
punpcklqdq m6, m5, m4 ;low: -out5 high: -out7
punpckhqdq m4, m5 ;low: out8 high: out10
punpcklqdq m5, m7, m2 ;low: out4 high: out6
punpckhqdq m2, m7 ;low: -out9 high: -out11
mova [coeffq+16*4], m2
mova [coeffq+16*5], m6
mova m2, [coeffq+16*6]
mova m6, [coeffq+16*7]
punpckhqdq m1, m6, m0 ;low: -out13 high: -out15
punpcklqdq m0, m6 ;low: out0 high: out2
punpckhqdq m6, m3, m2 ;low: out12 high: out14
punpcklqdq m2, m3 ;low: -out1 high: -out3
mova m7, [o(pw_2048)]
.end1:
REPX {pmulhrsw x, m7}, m0, m5, m4, m6
pxor m3, m3
psubw m3, m7
mova m7, [coeffq+16*4]
REPX {pmulhrsw x, m3}, m2, m7, m1
pmulhrsw m3, [coeffq+16*5]
mova [coeffq+16*7], m5
punpckhqdq m5, m4, m7 ;low: out10 high: out11
punpcklqdq m4, m7 ;low: out8 high: out9
punpckhqdq m7, m6, m1 ;low: out14 high: out15
punpcklqdq m6, m1 ;low: out12 high: out13
punpckhqdq m1, m0, m2 ;low: out2 high: out3
punpcklqdq m0, m2 ;low: out0 high: out1
mova [coeffq+16*4], m4
mova m4, [coeffq+16*7]
punpcklqdq m2, m4, m3 ;low: out4 high: out5
punpckhqdq m4, m3 ;low: out6 high: out7
mova m3, m4
.end2:
mova [coeffq+16*5], m5
mova [coeffq+16*6], m6
mov r3, coeffq
WRITE_4X8 0, 1, 2, 3
mova m0, [r3+16*4]
mova m1, [r3+16*5]
mova m2, [r3+16*6]
mova m3, m7
lea dstq, [dstq+strideq*4]
WRITE_4X8 0, 1, 2, 3
.end3:
pxor m7, m7
REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
ret
INV_TXFM_4X16_FN flipadst, dct, 0
INV_TXFM_4X16_FN flipadst, adst
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity
cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
lea r3, [o(m(iflipadst_4x8_internal).pass1)]
jmp m(idct_4x16_internal).pass1
.pass2:
call m(iadst_16x4_internal).main
punpckhqdq m6, m5, m4 ;low: out5 high: out7
punpcklqdq m4, m5 ;low: -out8 high: -out10
punpckhqdq m5, m7, m2 ;low: -out4 high: -out6
punpcklqdq m2, m7 ;low: out9 high: out11
mova [coeffq+16*4], m2
mova [coeffq+16*5], m6
mova m2, [coeffq+16*6]
mova m6, [coeffq+16*7]
punpcklqdq m1, m6, m0 ;low: out13 high: out15
punpckhqdq m0, m6 ;low: -out0 high: -out2
punpcklqdq m6, m3, m2 ;low: -out12 high: -out14
punpckhqdq m2, m3 ;low: out1 high: out3
mova m7, [o(pw_m2048)]
jmp m(iadst_4x16_internal).end1
INV_TXFM_4X16_FN identity, dct, 3
INV_TXFM_4X16_FN identity, adst
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity
cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
lea r3, [o(m(iidentity_4x8_internal).pass1)]
jmp m(idct_4x16_internal).pass1
.pass2:
mova m7, [o(pw_5793x4)]
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
psllw m7, [coeffq+16*7], 2
pmulhrsw m7, [o(pw_5793x4)]
mova [coeffq+16*7], m7
mova m7, [o(pw_2048)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
pmulhrsw m7, [coeffq+16*7]
mova [coeffq+16*4], m4
jmp m(iadst_4x16_internal).end2
%macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 16x4, 8
%if %3 >= 0
%ifidn %1_%2, dct_identity
mova m3, [o(pw_2896x8)]
pmulhrsw m3, [coeffq]
mova m0, [o(pw_16384)]
pmulhrsw m3, m0
psrlw m0, 3 ; pw_2048
paddw m3, m3
pmulhrsw m3, [o(pw_5793x4)]
pmulhrsw m3, m0
punpcklwd m3, m3
pshufd m0, m3, q0000
pshufd m1, m3, q1111
pshufd m2, m3, q2222
pshufd m3, m3, q3333
lea tx2q, [dstq+8]
call m(iadst_8x4_internal).end2
add coeffq, 16*4
mov dstq, tx2q
TAIL_CALL m(iadst_8x4_internal).end2
%elifidn %1_%2, identity_dct
mova m5, [o(pw_16384)]
mova m6, [o(pw_5793x4)]
mova m7, [o(pw_2896x8)]
mov r3d, 2
.main_loop:
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
mova m2, [coeffq+16*2]
mova m3, [coeffq+16*3]
punpckhwd m4, m0, m1
punpcklwd m0, m1
punpckhwd m1, m2, m3
punpcklwd m2, m3
punpcklwd m0, m4
punpcklwd m2, m1
punpcklqdq m0, m2
psllw m0, 2
pmulhrsw m0, m6
pmulhrsw m0, m5
psrlw m1, m5, 3 ; pw_2048
pmulhrsw m0, m7
pmulhrsw m0, m1
.end:
pxor m3, m3
mova [coeffq+16*0], m3
mova [coeffq+16*1], m3
mova [coeffq+16*2], m3
mova [coeffq+16*3], m3
add coeffq, 16*4
lea tx2q, [dstq+8]
WRITE_8X4 0, 0, 0, 0, 1, 2, 3
mov dstq, tx2q
dec r3d
jg .main_loop
RET
%else
movd m1, [o(pw_2896x8)]
pmulhrsw m0, m1, [coeffq]
%ifidn %2, dct
movd m2, [o(pw_16384)]
mov [coeffq], eobd
mov r2d, 2
.dconly:
pmulhrsw m0, m2
movd m2, [o(pw_2048)] ;intentionally rip-relative
pmulhrsw m0, m1
pmulhrsw m0, m2
pshuflw m0, m0, q0000
punpcklwd m0, m0
pxor m5, m5
.dconly_loop:
mova m1, [dstq]
mova m3, [dstq+strideq]
punpckhbw m2, m1, m5
punpcklbw m1, m5
punpckhbw m4, m3, m5
punpcklbw m3, m5
paddw m2, m0
paddw m1, m0
paddw m4, m0
paddw m3, m0
packuswb m1, m2
packuswb m3, m4
mova [dstq], m1
mova [dstq+strideq], m3
lea dstq, [dstq+strideq*2]
dec r2d
jg .dconly_loop
RET
%else ; adst / flipadst
movd m2, [o(pw_16384)]
pmulhrsw m0, m2
pshuflw m0, m0, q0000
punpcklwd m0, m0
mov [coeffq], eobd
pmulhrsw m2, m0, [o(iadst4_dconly2b)]
pmulhrsw m0, [o(iadst4_dconly2a)]
mova m1, [o(pw_2048)]
pmulhrsw m0, m1
pmulhrsw m2, m1
%ifidn %2, adst
punpckhqdq m1, m0, m0
punpcklqdq m0, m0
punpckhqdq m3, m2, m2
punpcklqdq m2, m2
%else ; flipadst
mova m3, m0
punpckhqdq m0, m2, m2
punpcklqdq m1, m2, m2
punpckhqdq m2, m3, m3
punpcklqdq m3, m3
%endif
lea tx2q, [dstq+8]
call m(iadst_8x4_internal).end3
mov dstq, tx2q
TAIL_CALL m(iadst_8x4_internal).end3
%endif
%endif
%endif
%endmacro
%macro ITX_16X4_LOAD_COEFS 0
ITX_8X8_LOAD_COEFS
%endmacro
%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3]
punpckhwd m%5, m%4, m%1 ;packed in13 in3
punpcklwd m%1, m%4 ;packed in1 in15
punpcklwd m%6, m%3, m%2 ;packed in9 in7
punpckhwd m%2, m%3 ;packed in5 in11
mova m%7, [o(pd_2048)]
ITX_MUL2X_PACK %1, %4, %7, 401, 4076, 1 ;low: t8a high: t15a
ITX_MUL2X_PACK %6, %4, %7, 3166, 2598, 1 ;low: t9a high: t14a
ITX_MUL2X_PACK %2, %4, %7, 1931, 3612, 1 ;low: t10a high: t13a
ITX_MUL2X_PACK %5, %4, %7, 3920, 1189, 1 ;low: t11a high: t12a
psubsw m%4, m%1, m%6 ;low: t9 high: t14
paddsw m%1, m%6 ;low: t8 high: t15
psubsw m%3, m%5, m%2 ;low: t10 high: t13
paddsw m%2, m%5 ;low: t11 high: t12
punpcklqdq m%5, m%4, m%3 ;low: t9 high: t10
punpckhqdq m%4, m%3 ;low: t14 high: t13
punpcklwd m%6, m%4, m%5 ;packed t14 t9
punpckhwd m%5, m%4 ;packed t10 t13
pxor m%4, m%4
psubw m%4, m%5 ;packed -t10 -t13
ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a
ITX_MUL2X_PACK %4, %3, %7, 3784, 1567 ;low: t10a high: t13a
psubsw m%3, m%1, m%2 ;low: t11a high: t12a
paddsw m%1, m%2 ;low: t8a high: t15a
psubsw m%5, m%6, m%4 ;low: t10 high: t13
paddsw m%6, m%4 ;low: t9 high: t14
mova m%7, [o(pw_2896x8)]
punpckhqdq m%4, m%3, m%5 ;low: t12a high: t13
punpcklqdq m%3, m%5 ;low: t11a high: t10
psubw m%2, m%4, m%3
paddw m%3, m%4
pmulhrsw m%2, m%7 ;low: t11 high: t10a
pmulhrsw m%3, m%7 ;low: t12 high: t13a
punpckhqdq m%4, m%1, m%6 ;low: t15a high: t14
punpcklqdq m%1, m%6 ;low: t8a high: t9
%endmacro
INV_TXFM_16X4_FN dct, dct, 0
INV_TXFM_16X4_FN dct, adst, 0
INV_TXFM_16X4_FN dct, flipadst, 0
INV_TXFM_16X4_FN dct, identity, 3
cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ITX_16X4_LOAD_COEFS
call .main
.pass1_end:
punpckhwd m7, m0, m2 ;packed out1, out5
punpcklwd m0, m2 ;packed out0, out4
punpcklwd m2, m1, m3 ;packed out3, out7
punpckhwd m1, m3 ;packed out2, out6
mova [coeffq+16*6], m7
mova m7, [coeffq+16*7]
punpckhwd m3, m4, m6 ;packed out9, out13
punpcklwd m4, m6 ;packed out8, out12
punpcklwd m6, m5, m7 ;packed out11, out15
punpckhwd m5, m7 ;packed out10, out14
.pass1_end2:
mova m7, [o(pw_16384)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
pmulhrsw m7, [coeffq+16*6]
mova [coeffq+16*6], m7
.pass1_end3:
punpckhwd m7, m3, m6 ;packed 9, 11, 13, 15 high
punpcklwd m3, m6 ;packed 9, 10, 13, 15 low
punpckhwd m6, m4, m5 ;packed 8, 10, 12, 14 high
punpcklwd m4, m5 ;packed 8, 10, 12, 14 low
punpckhwd m5, m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(1)
punpcklwd m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(0)
punpckhwd m3, m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(3)
punpcklwd m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(2)
mova [coeffq+16*7], m3
mova m3, [coeffq+16*6]
punpckhwd m7, m3, m2 ;packed 1, 3, 5, 7 high
punpcklwd m3, m2 ;packed 1, 3, 5, 7 low
punpckhwd m2, m0, m1 ;packed 0, 2, 4, 6 high
punpcklwd m0, m1 ;packed 0, 2, 4, 6 low
punpckhwd m1, m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(1)
punpcklwd m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(0)
punpckhwd m3, m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(3)
punpcklwd m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(2)
jmp tx2q
.pass2:
lea tx2q, [o(m(idct_8x4_internal).pass2)]
.pass2_end:
mova [coeffq+16*4], m4
mova [coeffq+16*5], m5
mova [coeffq+16*6], m6
lea r3, [dstq+8]
call tx2q
add coeffq, 16*4
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
mova m2, [coeffq+16*2]
mova m3, [coeffq+16*3]
mov dstq, r3
jmp tx2q
ALIGN function_align
.main:
punpckhqdq m7, m0, m1 ;low:in1 high:in3
punpcklqdq m0, m1
punpcklqdq m1, m2, m3
punpckhqdq m3, m2 ;low:in7 high:in5
mova [coeffq+16*4], m7
mova [coeffq+16*5], m3
mova m7, [coeffq+16*7]
punpcklqdq m2, m4, m5
punpckhqdq m4, m5 ;low:in9 high:in11
punpcklqdq m3, m6, m7
punpckhqdq m7, m6 ;low:in15 high:in13
mova [coeffq+16*6], m4
IDCT8_1D_PACKED
mova m6, [coeffq+16*4]
mova m4, [coeffq+16*5]
mova m5, [coeffq+16*6]
mova [coeffq+16*4], m1
mova [coeffq+16*5], m2
mova [coeffq+16*6], m3
IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3
mova m1, [coeffq+16*4]
psubsw m3, m0, m7 ;low:out15 high:out14
paddsw m0, m7 ;low:out0 high:out1
psubsw m7, m1, m5 ;low:out12 high:out13
paddsw m1, m5 ;low:out3 high:out2
mova [coeffq+16*7], m3
mova m2, [coeffq+16*5]
mova m3, [coeffq+16*6]
psubsw m5, m2, m4 ;low:out11 high:out10
paddsw m2, m4 ;low:out4 high:out5
psubsw m4, m3, m6 ;low:out8 high:out9
paddsw m3, m6 ;low:out7 high:out6
mova m6, m7
ret
INV_TXFM_16X4_FN adst, dct
INV_TXFM_16X4_FN adst, adst
INV_TXFM_16X4_FN adst, flipadst
INV_TXFM_16X4_FN adst, identity
cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ITX_16X4_LOAD_COEFS
call .main
punpckhwd m6, m7, m0 ;packed -out11, -out15
punpcklwd m0, m7 ;packed out0, out4
punpcklwd m7, m3, m4 ;packed -out3, -out7
punpckhwd m4, m3 ;packed out8, out12
mova m1, [coeffq+16*6]
punpcklwd m3, m1, m5 ;packed -out1, -out5
punpckhwd m5, m1 ;packed out10, out14
mova m1, [coeffq+16*7]
mova [coeffq+16*6], m3
mova [coeffq+16*7], m7
punpckhwd m3, m2, m1 ;packed -out9, -out13
punpcklwd m1, m2 ;packed out2, out6
mova m7, [o(pw_16384)]
.pass1_end:
REPX {pmulhrsw x, m7}, m0, m1, m4, m5
pxor m2, m2
psubw m2, m7
mova m7, [coeffq+16*6]
REPX {pmulhrsw x, m2}, m7, m3, m6
pmulhrsw m2, [coeffq+16*7]
mova [coeffq+16*6], m7
jmp m(idct_16x4_internal).pass1_end3
.pass2:
lea tx2q, [o(m(iadst_8x4_internal).pass2)]
jmp m(idct_16x4_internal).pass2_end
ALIGN function_align
.main:
mova [coeffq+16*6], m0
pshufd m1, m1, q1032
pshufd m2, m2, q1032
punpckhwd m0, m6, m1 ;packed in13, in2
punpcklwd m1, m6 ;packed in3, in12
punpckhwd m6, m5, m2 ;packed in11, in4
punpcklwd m2, m5 ;packed in5, in10
mova m7, [o(pd_2048)]
ITX_MUL2X_PACK 0, 5, 7, 995, 3973 ;low:t2 high:t3
ITX_MUL2X_PACK 6, 5, 7, 1751, 3703 ;low:t4 high:t5
ITX_MUL2X_PACK 2, 5, 7, 3513, 2106 ;low:t10 high:t11
ITX_MUL2X_PACK 1, 5, 7, 3857, 1380 ;low:t12 high:t13
psubsw m5, m0, m2 ;low:t10a high:t11a
paddsw m0, m2 ;low:t2a high:t3a
psubsw m2, m6, m1 ;low:t12a high:t13a
paddsw m6, m1 ;low:t4a high:t5a
punpcklqdq m1, m5
punpckhwd m1, m5 ;packed t10a, t11a
punpcklqdq m5, m2
punpckhwd m2, m5 ;packed t13a, t12a
ITX_MUL2X_PACK 1, 5, 7, 3406, 2276 ;low:t10 high:t11
ITX_MUL2X_PACK 2, 5, 7, 4017, 799, 1 ;low:t12 high:t13
mova [coeffq+16*4], m0
mova [coeffq+16*5], m6
mova m0, [coeffq+16*6]
mova m6, [coeffq+16*7]
pshufd m0, m0, q1032
pshufd m3, m3, q1032
punpckhwd m5, m6, m0 ;packed in15, in0
punpcklwd m0, m6 ;packed in1, in14
punpckhwd m6, m4, m3 ;packed in9, in6
punpcklwd m3, m4 ;packed in7, in8
ITX_MUL2X_PACK 5, 4, 7, 201, 4091 ;low:t0 high:t1
ITX_MUL2X_PACK 6, 4, 7, 2440, 3290 ;low:t6 high:t7
ITX_MUL2X_PACK 3, 4, 7, 3035, 2751 ;low:t8 high:t9
ITX_MUL2X_PACK 0, 4, 7, 4052, 601 ;low:t14 high:t15
psubsw m4, m5, m3 ;low:t8a high:t9a
paddsw m5, m3 ;low:t0a high:t1a
psubsw m3, m6, m0 ;low:t14a high:t15a
paddsw m6, m0 ;low:t6a high:t7a
punpcklqdq m0, m4
punpckhwd m0, m4 ;packed t8a, t9a
punpcklqdq m4, m3
punpckhwd m3, m4 ;packed t15a, t14a
ITX_MUL2X_PACK 0, 4, 7, 799, 4017 ;low:t8 high:t9
ITX_MUL2X_PACK 3, 4, 7, 2276, 3406, 1 ;low:t14 high:t15
psubsw m4, m0, m2 ;low:t12a high:t13a
paddsw m0, m2 ;low:t8a high:t9a
psubsw m2, m1, m3 ;low:t14a high:t15a
paddsw m1, m3 ;low:t10a high:t11a
punpcklqdq m3, m4
punpckhwd m3, m4 ;packed t12a, t13a
punpcklqdq m4, m2
punpckhwd m2, m4 ;packed t15a, t14a
ITX_MUL2X_PACK 3, 4, 7, 1567, 3784 ;low:t12 high:t13
ITX_MUL2X_PACK 2, 4, 7, 3784, 1567, 1 ;low:t14 high:t15
psubsw m4, m0, m1 ;low:t10 high:t11
paddsw m0, m1 ;low:-out1 high:out14
psubsw m1, m3, m2 ;low:t14a high:t15a
paddsw m3, m2 ;low:out2 high:-out13
punpckhqdq m2, m4, m1 ;low:t11 high:t15a
punpcklqdq m4, m1 ;low:t10 high:t14a
psubw m1, m4, m2
paddw m2, m4
mova [coeffq+16*6], m0
mova [coeffq+16*7], m3
mova m0, [coeffq+16*4]
mova m3, [coeffq+16*5]
psubsw m4, m5, m3 ;low:t4 high:t5
paddsw m5, m3 ;low:t0 high:t1
psubsw m3, m0 ,m6 ;low:t6 high:t7
paddsw m0, m6 ;low:t2 high:t3
punpcklqdq m6, m4
punpckhwd m6, m4 ;packed t4, t5
punpcklqdq m4, m3
punpckhwd m3, m4 ;packed t7, t6
ITX_MUL2X_PACK 6, 4, 7, 1567, 3784 ;low:t4a high:t5a
ITX_MUL2X_PACK 3, 4, 7, 3784, 1567, 1 ;low:t6a high:t7a
psubsw m4, m5, m0 ;low:t2a high:t3a
paddsw m0, m5 ;low:out0 high:-out15
psubsw m5, m6, m3 ;low:t6 high:t7
paddsw m3, m6 ;low:-out3 high:out12
mova m7, [o(pw_2896x8)]
punpckhqdq m6, m4, m5 ;low:t3a high:t7
punpcklqdq m4, m5 ;low:t2a high:t6
psubw m5, m4, m6
paddw m4, m6
pmulhrsw m1, m7 ;low:-out9 high:out10
pmulhrsw m2, m7 ;low:out6 high:-out5
pmulhrsw m5, m7 ;low:out8 high:-out11
pmulhrsw m4, m7 ;low:-out7 high:out4
punpckhqdq m7, m4, m5 ;low:out4 high:-out11
punpcklqdq m4, m5 ;low:-out7 high:out8
punpckhqdq m5, m2, m1 ;low:-out5 high:out10
punpcklqdq m2, m1 ;low:out6 high:-out9
ret
INV_TXFM_16X4_FN flipadst, dct
INV_TXFM_16X4_FN flipadst, adst
INV_TXFM_16X4_FN flipadst, flipadst
INV_TXFM_16X4_FN flipadst, identity
cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ITX_16X4_LOAD_COEFS
call m(iadst_16x4_internal).main
punpcklwd m6, m7, m0 ;packed out11, out15
punpckhwd m0, m7 ;packed -out0, -out4
punpckhwd m7, m3, m4 ;packed out3, out7
punpcklwd m4, m3 ;packed -out8, -out12
mova m1, [coeffq+16*6]
punpckhwd m3, m1, m5 ;packed out1, out5
punpcklwd m5, m1 ;packed -out10, -out14
mova m1, [coeffq+16*7]
mova [coeffq+16*6], m3
mova [coeffq+16*7], m7
punpcklwd m3, m2, m1 ;packed out9, out13
punpckhwd m1, m2 ;packed -out2, -out6
mova m7, [o(pw_m16384)]
jmp m(iadst_16x4_internal).pass1_end
.pass2:
lea tx2q, [o(m(iflipadst_8x4_internal).pass2)]
jmp m(idct_16x4_internal).pass2_end
INV_TXFM_16X4_FN identity, dct, 15
INV_TXFM_16X4_FN identity, adst
INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity
cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ITX_16X4_LOAD_COEFS
mova m7, [o(pw_5793x4)]
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
punpckhwd m7, m0, m2 ;packed out1, out5
punpcklwd m0, m2 ;packed out0, out4
punpckhwd m2, m1, m3 ;packed out3, out7
punpcklwd m1, m3 ;packed out2, out6
mova [coeffq+16*6], m7
psllw m7, [coeffq+16*7], 2