Commits on Source (81)
......@@ -255,6 +255,17 @@ build-ubuntu-snap:
expire_in: 1 week
allow_failure: true
build-debian-ppc64le:
stage: build
image: registry.videolan.org/dav1d-debian-unstable-ppc64le:20190606105121
tags:
- ppc64le
- docker
script:
- meson build --buildtype release --werror
- ninja -C build
- cd build && meson test -v
test-debian:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
......@@ -312,7 +323,7 @@ test-debian-asan:
- test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
- test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
- git clone cache/dav1d-test-data.git tests/dav1d-test-data
- meson build --buildtype debugoptimized -Dtestdata_tests=true -Dlogging=false -Db_sanitize=address -Dbuild_asm=false
- meson build --buildtype debugoptimized -Dtestdata_tests=true -Dlogging=false -Db_sanitize=address -Denable_asm=false
- ninja -C build
- cd build && time meson test -v --setup=sanitizer
dependencies: []
......@@ -334,7 +345,7 @@ test-debian-msan:
- test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
- test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
- git clone cache/dav1d-test-data.git tests/dav1d-test-data
- env CC=clang meson build --buildtype debugoptimized -Dtestdata_tests=true -Dlogging=false -Db_sanitize=memory -Db_lundef=false -Dbuild_asm=false
- env CC=clang meson build --buildtype debugoptimized -Dtestdata_tests=true -Dlogging=false -Db_sanitize=memory -Db_lundef=false -Denable_asm=false
- ninja -C build
- cd build && time meson test -v --setup=sanitizer
dependencies: []
......@@ -356,7 +367,7 @@ test-debian-ubsan:
- test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
- test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
- git clone cache/dav1d-test-data.git tests/dav1d-test-data
- env CC=clang meson build --buildtype debugoptimized -Dtestdata_tests=true -Dlogging=false -Db_sanitize=undefined -Db_lundef=false -Dbuild_asm=false
- env CC=clang meson build --buildtype debugoptimized -Dtestdata_tests=true -Dlogging=false -Db_sanitize=undefined -Db_lundef=false -Denable_asm=false
- ninja -C build
- cd build && time meson test -v --setup=sanitizer
dependencies: []
......@@ -384,3 +395,69 @@ test-win64:
- ninja -C build
- cd build && time meson test -v
dependencies: []
test-debian-aarch64:
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
stage: test
tags:
- aarch64
- debian
cache:
key: testdata.git-20190215
paths:
- cache/dav1d-test-data.git/
script:
- test -d cache || mkdir cache
- test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
- test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
- git clone cache/dav1d-test-data.git tests/dav1d-test-data
- meson build --buildtype release -Dtestdata_tests=true -Dlogging=false
- ninja -C build
- cd build && time meson test -v
dependencies: []
test-debian-ppc64le:
image: registry.videolan.org/dav1d-debian-unstable-ppc64le:20190606105121
stage: test
tags:
- ppc64le
- docker
cache:
key: testdata.git-20190215
paths:
- cache/dav1d-test-data.git/
script:
- test -d cache || mkdir cache
- test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
- test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
- git clone cache/dav1d-test-data.git tests/dav1d-test-data
- meson build --buildtype release -Dtestdata_tests=true -Dlogging=false
- ninja -C build
- cd build && time meson test -v
dependencies: []
test-debian-armv7-clang-5:
stage: test
image: registry.videolan.org:5000/dav1d-debian-unstable-armv7:20190202101732
tags:
- armv7
- debian
cache:
key: testdata.git-20190215
paths:
- cache/dav1d-test-data.git/
script:
- test -d cache || mkdir cache
- test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
- test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
- git clone cache/dav1d-test-data.git tests/dav1d-test-data
- env CC=clang-5.0 CFLAGS='-integrated-as' meson build --buildtype release -Dtestdata_tests=true -Dlogging=false
- ninja -C build
- cd build && time meson test -v
dependencies: []
Changes for 0.4.0 'Cheetah':
----------------------------
- Fix playback with unknown OBUs
- Add an option to limit the maximum frame size
- SSE2 and ARM64 optimizations for MSAC
- Improve speed on 32-bit systems
- Optimizations in OBMC blend
- Reduce RAM usage significantly
- Initial PPC SIMD code (cdef_filter)
- NEON optimizations for blend functions on ARM
- NEON optimizations for w_mask functions on ARM
- NEON optimizations for inverse transforms on ARM64
- Improve handling of malloc failures
- Simple Player example in tools
Changes for 0.3.1 'Sailfish':
------------------------------
......
![dav1d logo](dav1d_logo.png)
# dav1d
**dav1d** is a new **AV1** cross-platform **d**ecoder, open-source, and focused on speed and correctness.
......@@ -76,7 +78,7 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr
# Run tests
1. During initial build dir setup or `meson configure` specify `-Dbuild_tests=true`
1. During initial build dir setup or `meson configure` specify `-Denable_tests=true`
2. In the build directory run `meson test` optionally with `-v` for more verbose output, especially useful
for checkasm
......@@ -87,10 +89,10 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr
```
git clone https://code.videolan.org/videolan/dav1d-test-data.git tests/dav1d-test-data
```
2. During initial build dir setup or `meson configure` specify `-Dbuild_tests=true` and `-Dtestdata_tests=true`
2. During initial build dir setup or `meson configure` specify `-Denable_tests=true` and `-Dtestdata_tests=true`
```
meson .test -Dbuild_tests=true -Dtestdata_tests=true
meson .test -Denable_tests=true -Dtestdata_tests=true
```
3. In the build directory run `meson test` optionally with `-v` for more verbose output
......
......@@ -16,4 +16,13 @@ The Alliance for Open Media (AOM) for funding this project.
And all the dav1d Authors (git shortlog -sn), including:
Janne Grunau, Ronald S. Bultje, Martin Storsjö, James Almer, Henrik Gramner, Marvin Scholz, Luc Trudeau, David Michael Barr, Jean-Baptiste Kempf, Hugo Beauzée-Luyssen, Steve Lhomme, Francois Cartegnie, Konstantin Pavlov, Nathan E. Egge, Victorien Le Couviour--Tuffet, Derek Buitenhuis, Liwei Wang, Raphaël Zumer, Michael Bradshaw, Niklas Haas, Xuefeng Jiang, Boyuan Xiao, Kyle Siefring, Matthias Dressel, Rupert Swarbrick, Thierry Foucu, Thomas Daede, Jan Beich, SmilingWolf, Tristan Laurent, Vittorio Giovara, Anisse Astier, Dale Curtis, Fred Barbier, Jean-Yves Avenard, Luca Barbato, Mark Shuttleworth, Nicolas Frattaroli, Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Timo Gurr and skal.
Janne Grunau, Ronald S. Bultje, Martin Storsjö, Henrik Gramner, James Almer,
Marvin Scholz, Luc Trudeau, Jean-Baptiste Kempf, Victorien Le Couviour--Tuffet,
David Michael Barr, Hugo Beauzée-Luyssen, Steve Lhomme, Nathan E. Egge,
Francois Cartegnie, Konstantin Pavlov, Liwei Wang, Xuefeng Jiang,
Derek Buitenhuis, Raphaël Zumer, Niklas Haas, Michael Bradshaw, Kyle Siefring,
Raphael Zumer, Boyuan Xiao, Thierry Foucu, Matthias Dressel, Thomas Daede,
Rupert Swarbrick, Jan Beich, Dale Curtis, SmilingWolf, Tristan Laurent,
Vittorio Giovara, Rostislav Pehlivanov, Shiz, skal, Steinar Midtskogen,
Luca Barbato, Justin Bull, Jean-Yves Avenard, Timo Gurr, Fred Barbier,
Anisse Astier, Pablo Stebler, Nicolas Frattaroli, Mark Shuttleworth.
[inline SVG markup of the dav1d logo omitted]
......@@ -46,7 +46,7 @@
/* x86-64 needs 32-byte alignment for AVX2. */
#define ALIGN_32_VAL 32
#define ALIGN_16_VAL 16
#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64
#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE
/* ARM doesn't benefit from anything more than 16-byte alignment. */
#define ALIGN_32_VAL 16
#define ALIGN_16_VAL 16
......
......@@ -40,6 +40,8 @@ typedef void coef;
#elif BITDEPTH == 8
typedef uint8_t pixel;
typedef int16_t coef;
#define PIXEL_TYPE uint8_t
#define COEF_TYPE int16_t
#define pixel_copy memcpy
#define pixel_set memset
#define iclip_pixel iclip_u8
......@@ -54,6 +56,8 @@ typedef int16_t coef;
#elif BITDEPTH == 16
typedef uint16_t pixel;
typedef int32_t coef;
#define PIXEL_TYPE uint16_t
#define COEF_TYPE int32_t
#define pixel_copy(a, b, c) memcpy(a, b, (c) << 1)
static inline void pixel_set(pixel *const dst, const int val, const int num) {
for (int n = 0; n < num; n++)
......
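For context, `pixel`, `coef`, and the new `PIXEL_TYPE`/`COEF_TYPE` aliases come from dav1d's per-bitdepth template header, which is compiled once for each enabled bitdepth. A minimal illustrative sketch follows; the include path, the hand-defined `BITDEPTH`, and the helper function are assumptions for illustration only (the build system normally defines `BITDEPTH` itself):

```c
#include <stddef.h>

/* Illustration only: the build normally passes -DBITDEPTH=8 or =16 per
 * template object instead of defining it by hand. */
#define BITDEPTH 8
#include "common/bitdepth.h"   /* assumed location of the typedefs shown above */

static void fill_plane(pixel *dst, const ptrdiff_t stride,
                       const int w, const int h, const int val)
{
    for (int y = 0; y < h; y++) {
        pixel_set(dst, val, w);    /* memset at 8 bpc, an explicit loop at 16 bpc */
        dst += PXSTRIDE(stride);   /* stride is in bytes; PXSTRIDE converts to pixels */
    }
}
```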
......@@ -45,19 +45,25 @@ static inline void append_plane_to_file(const pixel *buf, ptrdiff_t stride,
fclose(f);
}
static inline void hex_dump(const pixel *buf, ptrdiff_t stride,
int w, int h, const char *what)
static inline void hex_fdump(FILE *out, const pixel *buf, ptrdiff_t stride,
int w, int h, const char *what)
{
printf("%s\n", what);
fprintf(out, "%s\n", what);
while (h--) {
int x;
for (x = 0; x < w; x++)
printf(" " PIX_HEX_FMT, buf[x]);
fprintf(out, " " PIX_HEX_FMT, buf[x]);
buf += PXSTRIDE(stride);
printf("\n");
fprintf(out, "\n");
}
}
static inline void hex_dump(const pixel *buf, ptrdiff_t stride,
int w, int h, const char *what)
{
hex_fdump(stdout, buf, stride, w, h, what);
}
static inline void coef_dump(const coef *buf, const int w, const int h,
const int len, const char *what)
{
......
......@@ -64,6 +64,8 @@ typedef struct Dav1dSettings {
int apply_grain;
int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
int all_layers; ///< output all spatial layers of a scalable AV1 bitstream
unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
uint8_t reserved[32]; ///< reserved for future use
Dav1dPicAllocator allocator;
Dav1dLogger logger;
} Dav1dSettings;
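The new `frame_size_limit` field lets callers cap the size of frames the decoder will process, with 0 keeping the previous unlimited behaviour. A minimal usage sketch, assuming the usual `dav1d_default_settings()` / `dav1d_open()` flow; the 4K limit and the helper name are illustrative only:

```c
#include <dav1d/dav1d.h>

/* Sketch: open a decoder that refuses frames larger than 4K.
 * The limit value is an example, not a dav1d default. */
static Dav1dContext *open_limited_decoder(void) {
    Dav1dSettings settings;
    dav1d_default_settings(&settings);
    settings.frame_size_limit = 3840 * 2160;   /* maximum frame size, in pixels */

    Dav1dContext *ctx = NULL;
    if (dav1d_open(&ctx, &settings) < 0)
        return NULL;                           /* e.g. invalid settings or OOM */
    return ctx;
}
```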
......@@ -142,7 +144,7 @@ DAV1D_API int dav1d_send_data(Dav1dContext *c, Dav1dData *in);
* passed-in arguments.
*
* @note To drain buffered frames from the decoder (i.e. on end of stream),
* call this function until it returns -EAGAIN.
* call this function until it returns DAV1D_ERR(EAGAIN).
*
* @code{.c}
* Dav1dData data = { 0 };
......@@ -155,11 +157,11 @@ DAV1D_API int dav1d_send_data(Dav1dContext *c, Dav1dData *in);
* // Keep going even if the function can't consume the current data
* packet. It eventually will after one or more frames have been
* returned in this loop.
* if (res < 0 && res != -EAGAIN)
* if (res < 0 && res != DAV1D_ERR(EAGAIN))
* free_and_abort();
* res = dav1d_get_picture(c, &p);
* if (res < 0) {
* if (res != -EAGAIN)
* if (res != DAV1D_ERR(EAGAIN))
* free_and_abort();
* } else
* output_and_unref_picture(&p);
......@@ -170,7 +172,7 @@ DAV1D_API int dav1d_send_data(Dav1dContext *c, Dav1dData *in);
* do {
* res = dav1d_get_picture(c, &p);
* if (res < 0) {
* if (res != -EAGAIN)
* if (res != DAV1D_ERR(EAGAIN))
* free_and_abort();
* } else
* output_and_unref_picture(&p);
......
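The documentation now consistently uses the portable `DAV1D_ERR(EAGAIN)` wrapper instead of raw `-EAGAIN`. Pieced together, the send/receive loop those snippets describe looks roughly like the sketch below; `read_data()` and `output_picture()` are application-side placeholders, not dav1d API:

```c
#include <errno.h>
#include <dav1d/dav1d.h>

/* Application-side placeholders (not dav1d API): read_data() returns 0 on
 * success and negative when the input is exhausted. */
int  read_data(Dav1dData *data);
void output_picture(const Dav1dPicture *p);

/* Hedged sketch of the loop described in the documentation above. */
int decode_all(Dav1dContext *c) {
    Dav1dData data = { 0 };
    Dav1dPicture p = { 0 };
    int res;

    if (read_data(&data) < 0) return 0;       /* nothing to decode */
    do {
        res = dav1d_send_data(c, &data);
        /* Keep going even if the current packet can't be consumed yet; it
         * will be once one or more frames have been returned below. */
        if (res < 0 && res != DAV1D_ERR(EAGAIN))
            return res;
        res = dav1d_get_picture(c, &p);
        if (res < 0) {
            if (res != DAV1D_ERR(EAGAIN))
                return res;
        } else {
            output_picture(&p);
            dav1d_picture_unref(&p);
        }
    } while (data.sz || read_data(&data) == 0);

    /* End of stream: drain buffered frames until DAV1D_ERR(EAGAIN). */
    do {
        res = dav1d_get_picture(c, &p);
        if (res < 0) {
            if (res != DAV1D_ERR(EAGAIN))
                return res;
        } else {
            output_picture(&p);
            dav1d_picture_unref(&p);
        }
    } while (res >= 0);
    return 0;
}
```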
......@@ -23,14 +23,14 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
version: '0.3.1',
version: '0.4.0',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
'b_ndebug=if-release'],
meson_version: '>= 0.47.0')
dav1d_soname_version = '1.1.0'
dav1d_soname_version = '2.0.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
......@@ -61,14 +61,15 @@ foreach bitdepth : ['8', '16']
endforeach
# ASM option
is_asm_enabled = (get_option('build_asm') == true and
is_asm_enabled = (get_option('enable_asm') == true and
(host_machine.cpu_family().startswith('x86') or
host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm')))
host_machine.cpu_family().startswith('arm') or
host_machine.cpu() == 'ppc64le'))
cdata.set10('HAVE_ASM', is_asm_enabled)
if is_asm_enabled and get_option('b_sanitize') == 'memory'
error('asm causes false positive with memory sanitizer. Use \'-Dbuild_asm=false\'.')
error('asm causes false positive with memory sanitizer. Use \'-Denable_asm=false\'.')
endif
# Logging option
......@@ -160,7 +161,8 @@ elif cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_arg
endif
if (host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm'))
host_machine.cpu_family().startswith('arm') or
host_machine.cpu() == 'ppc64le')
if cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args)
cdata.set('HAVE_GETAUXVAL', 1)
endif
......@@ -180,16 +182,23 @@ endif
# Compiler flags that should be set
# but when the compiler does not support them
# it is not an error and they are silently tolerated
optional_arguments += [
'-Wundef',
'-Werror=vla',
'-Wno-maybe-uninitialized',
'-Wno-missing-field-initializers',
'-Wno-unused-parameter',
'-Werror=missing-prototypes',
'-Wshorten-64-to-32',
]
if cc.get_id() == 'msvc'
if cc.get_id() != 'msvc'
optional_arguments += [
'-Wundef',
'-Werror=vla',
'-Wno-maybe-uninitialized',
'-Wno-missing-field-initializers',
'-Wno-unused-parameter',
'-Werror=missing-prototypes',
'-Wshorten-64-to-32',
]
if host_machine.cpu_family() == 'x86'
optional_arguments += [
'-msse2',
'-mfpmath=sse',
]
endif
else
optional_arguments += [
'-wd4028', # parameter different from declaration
'-wd4996' # use of POSIX functions
......@@ -286,26 +295,18 @@ if (is_asm_enabled and
endif
endif
cdata.set10('ARCH_X86', host_machine.cpu_family().startswith('x86'))
cdata.set10('ARCH_X86_64', host_machine.cpu_family() == 'x86_64')
cdata.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86')
if host_machine.cpu_family().startswith('x86')
cdata.set10('ARCH_X86', true)
if host_machine.cpu_family() == 'x86_64'
cdata_asm.set10('ARCH_X86_64', true)
cdata.set10('ARCH_X86_64', true)
cdata_asm.set10('ARCH_X86_32', false)
cdata.set10('ARCH_X86_32', false)
else
cdata_asm.set10('ARCH_X86_64', false)
cdata.set10('ARCH_X86_64', false)
cdata_asm.set10('ARCH_X86_32', true)
cdata.set10('ARCH_X86_32', true)
cdata_asm.set10('PIC', true)
endif
else
cdata.set10('ARCH_X86', false)
cdata.set10('ARCH_X86_64', false)
cdata.set10('ARCH_X86_32', false)
cdata_asm.set10('ARCH_X86_64', host_machine.cpu_family() == 'x86_64')
cdata_asm.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86')
cdata_asm.set10('PIC', true)
endif
cdata.set10('ARCH_PPC64LE', host_machine.cpu() == 'ppc64le')
if cc.symbols_have_underscore_prefix()
cdata.set10('PREFIX', true)
cdata_asm.set10('PREFIX', true)
......@@ -332,6 +333,11 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
# check NASM version
if nasm.found()
nasm_r = run_command(nasm, '-v')
if nasm_r.returncode() != 0
error('failed running nasm to obtain its version')
endif
out = nasm_r.stdout().strip().split()
if out[1].to_lower() == 'version'
if out[2].version_compare('<2.13.02')
......
......@@ -5,17 +5,17 @@ option('bitdepths',
choices: ['8', '16'],
description: 'Enable only specified bitdepths')
option('build_asm',
option('enable_asm',
type: 'boolean',
value: true,
description: 'Build asm files, if available')
option('build_tools',
option('enable_tools',
type: 'boolean',
value: true,
description: 'Build dav1d cli tools')
option('build_tests',
option('enable_tests',
type: 'boolean',
value: true,
description: 'Build dav1d tests')
......@@ -36,6 +36,10 @@ option('fuzzing_engine',
value: 'none',
description: 'Select the fuzzing engine')
option('fuzzer_ldflags',
type: 'string',
description: 'Extra LDFLAGS used during linking of fuzzing binaries')
option('stack_alignment',
type: 'integer',
value: 0)
......@@ -215,6 +215,637 @@ bidir_fn w_avg
bidir_fn mask
.macro w_mask_fn type
function w_mask_\type\()_8bpc_neon, export=1
push {r4-r10,lr}
ldr r4, [sp, #32]
ldr r5, [sp, #36]
ldr r6, [sp, #40]
ldr r7, [sp, #44]
clz r8, r4
adr r9, L(w_mask_\type\()_tbl)
sub r8, r8, #24
ldr r8, [r9, r8, lsl #2]
add r9, r9, r8
mov r12, #6903
vdup.16 q14, r12
.if \type == 444
vmov.i8 q15, #64
.elseif \type == 422
vdup.8 d0, r7 // d0[] <- sign
vmov.i8 d30, #129
vsub.i8 d30, d30, d0 // 129 - sign
.elseif \type == 420
vdup.16 q0, r7 // d0[] <- sign
vmov.i16 q15, #256
vsub.i16 q15, q15, q0 // 256 - sign
.endif
add r12, r0, r1
lsl r1, r1, #1
bx r9
.align 2
L(w_mask_\type\()_tbl):
.word 1280f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
.word 640f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
.word 320f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
.word 160f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
.word 8f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
.word 4f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
4:
vld1.16 {d0, d1, d2, d3}, [r2]! // tmp1 (four rows at once)
vld1.16 {d4, d5, d6, d7}, [r3]! // tmp2 (four rows at once)
subs r5, r5, #4
vsub.i16 q8, q2, q0 // tmp2-tmp1
vsub.i16 q9, q3, q1
vabd.s16 q10, q0, q2 // (abs(tmp1[x] - tmp2[x]))
vabd.s16 q11, q1, q3
vqsub.u16 q10, q14, q10 // 6903 - abs ()
vqsub.u16 q11, q14, q11
vshr.s16 q10, q10, #8 // 64-m = (6903 - abs()) >> 8
vshr.s16 q11, q11, #8
vshl.s16 q12, q10, #9 // (64-m)<<9
vshl.s16 q13, q11, #9
vqdmulh.s16 q12, q12, q8 // ((tmp2-tmp1)*(64-m)<<9)>>15
vqdmulh.s16 q13, q13, q9
vadd.i16 q12, q12, q0 // (((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1
vadd.i16 q13, q13, q1
vqrshrun.s16 d24, q12, #4 // (((((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1) + 8) >> 4
vqrshrun.s16 d25, q13, #4
.if \type == 444
vmovn.u16 d20, q10 // 64 - m
vmovn.u16 d21, q11
vsub.i8 q10, q15, q10 // m
vst1.8 {d20, d21}, [r6]!
.elseif \type == 422
vpadd.s16 d20, d20, d21 // (64 - m) + (64 - n) (column wise addition)
vpadd.s16 d21, d22, d23
vmovn.s16 d6, q10
vhsub.u8 d6, d30, d6 // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
vst1.8 {d6}, [r6]!
.elseif \type == 420
vadd.s16 d20, d20, d21 // (64 - my1) + (64 - my2) (row wise addition)
vadd.s16 d21, d22, d23
vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n))
vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
vst1.32 {d20[0]}, [r6]!
.endif
vst1.32 {d24[0]}, [r0], r1
vst1.32 {d24[1]}, [r12], r1
vst1.32 {d25[0]}, [r0], r1
vst1.32 {d25[1]}, [r12], r1
bgt 4b
pop {r4-r10,pc}
8:
vld1.16 {d0, d1, d2, d3}, [r2]! // tmp1y1, tmp1y2
vld1.16 {d4, d5, d6, d7}, [r3]! // tmp2y1, tmp2y2
subs r5, r5, #2
vsub.i16 q8, q2, q0 // tmp2y1 - tmp1y1
vsub.i16 q9, q3, q1 // tmp2y2 - tmp1y2
vabd.s16 q10, q0, q2 // abs(tmp1y1 - tmp2y1)
vabd.s16 q11, q1, q3 // abs(tmp1y2 - tmp2y2)
vqsub.u16 q10, q14, q10 // 6903 - abs(tmp1y1 - tmp2y1)
vqsub.u16 q11, q14, q11 // 6903 - abs(tmp1y2 - tmp2y2)
vshr.s16 q10, q10, #8 // 64 - my1 = 6903 - abs(tmp1y1 - tmp2y1) >> 8
vshr.s16 q11, q11, #8 // 64 - my2 = 6903 - abs(tmp1y2 - tmp2y2) >> 8
vshl.s16 q12, q10, #9 // (64 - my1) << 9
vshl.s16 q13, q11, #9 // (64 - my2) << 9
vqdmulh.s16 q12, q12, q8 // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15
vqdmulh.s16 q13, q13, q9 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15
vadd.s16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1
vadd.s16 q13, q13, q1 // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2
vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4
vqrshrun.s16 d25, q13, #4 // (((((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4
.if \type == 444
vmovn.u16 d20, q10 // 64 - m
vmovn.u16 d21, q11
vsub.i8 q10, q15, q10 // m
vst1.8 {d20, d21}, [r6]!
.elseif \type == 422
vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition)
vpadd.s16 d21, d22, d23 // (64 - my2) + (64 - ny2)
vmovn.s16 d20, q10
vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1/y2) + (64 - ny1/y2))) >> 1
vst1.8 {d20}, [r6]!
.elseif \type == 420
vadd.s16 q10, q10, q11 // (64 - my1) + (64 - my2) (row wise addition)
vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n))
vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
vst1.32 {d20[0]}, [r6]!
.endif
vst1.16 {d24}, [r0], r1
vst1.16 {d25}, [r12], r1
bgt 8b
pop {r4-r10,pc}
1280:
640:
320:
160:
sub r1, r1, r4
.if \type == 444
add r10, r6, r4
.elseif \type == 422
add r10, r6, r4, lsr #1
.endif
mov lr, r7
add r9, r3, r4, lsl #1
add r7, r2, r4, lsl #1
161:
mov r8, r4
16:
vld1.16 {d0, d1, d2, d3}, [r2]! // tmp1y1
vld1.16 {d4, d5, d6, d7}, [r3]! // tmp2y1
vld1.16 {d16, d17, d18, d19}, [r7]! // tmp1y2
subs r8, r8, #16
vsub.i16 q2, q2, q0 // tmp2y1 - tmp1y1
vsub.i16 q3, q3, q1
vabs.s16 q10, q2 // abs(tmp2y1 - tmp1y1)
vabs.s16 q11, q3
vqsub.u16 q10, q14, q10 // 6903 - abs(tmp1y1 - tmp2y1)
vqsub.u16 q11, q14, q11
vshr.s16 q10, q10, #8 // 64 - my1 = 6903 - abs(tmp1y1 - tmp2y1) >> 8
vshr.s16 q11, q11, #8
vshl.s16 q12, q10, #9 // (64 - my1) << 9
vshl.s16 q13, q11, #9
vqdmulh.s16 q12, q12, q2 // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15
vqdmulh.s16 q13, q13, q3
vadd.i16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1
vadd.i16 q13, q13, q1
vld1.16 {d0, d1, d2, d3}, [r9]! // tmp2y2
.if \type == 444
vmovn.u16 d20, q10 // 64 - my1
vmovn.u16 d21, q11
vsub.i8 q10, q15, q10 // my1
vst1.8 {d20, d21}, [r6]!
.elseif \type == 422
vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition)
vpadd.s16 d21, d22, d23
vmovn.s16 d20, q10
vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1) + (64 - ny1))) >> 1
vst1.8 {d20}, [r6]!
.endif
vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1)*(64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4
vqrshrun.s16 d25, q13, #4
vsub.i16 q0, q0, q8 // tmp2y2 - tmp1y2
vsub.i16 q1, q1, q9
vst1.16 {d24, d25}, [r0]! // store dsty1
vabs.s16 q2, q0 // abs(tmp2y2 - tmp1y2)
vabs.s16 q3, q1
vqsub.u16 q2, q14, q2 // 6903 - abs(tmp2y2 - tmp1y2)
vqsub.u16 q3, q14, q3
vshr.s16 q2, q2, #8 // (6903 - abs(tmp2y2 - tmp1y2)) >> 8
vshr.s16 q3, q3, #8
vshl.s16 q12, q2, #9 // (64 - my2) << 9
vshl.s16 q13, q3, #9
.if \type == 444
vmovn.u16 d4, q2 // 64 - my2
vmovn.u16 d5, q3
vsub.i8 q2, q15, q2 // my2
vst1.8 {d4, d5}, [r10]!
.elseif \type == 422
vpadd.s16 d4, d4, d5 // (64 - my2) + (64 - ny2) (column wise addition)
vpadd.s16 d5, d6, d7
vmovn.s16 d4, q2
vhsub.u8 d4, d30, d4 // ((129 - sign) - ((64 - my2) + (64 - ny2))) >> 1
vst1.8 {d4}, [r10]!
.elseif \type == 420
vadd.s16 q10, q10, q2 // (64 - my1) + (64 - my2) (row wise addition)
vadd.s16 q11, q11, q3
vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
vpadd.s16 d21, d22, d23
vsub.s16 q10, q15, q10 // (256 - sign) - ((128 - m) + (128 - n))
vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
vst1.8 {d20}, [r6]!
.endif
vqdmulh.s16 q12, q12, q0 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15
vqdmulh.s16 q13, q13, q1
vadd.i16 q12, q12, q8 // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2
vadd.i16 q13, q13, q9
vqrshrun.s16 d24, q12, #4 // (((((tmp2y2 - tmp1y2)*(64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4
vqrshrun.s16 d25, q13, #4
vst1.16 {d24, d25}, [r12]! // store dsty2
bgt 16b
subs r5, r5, #2
add r2, r2, r4, lsl #1
add r3, r3, r4, lsl #1
add r7, r7, r4, lsl #1
add r9, r9, r4, lsl #1
.if \type == 444
add r6, r6, r4
add r10, r10, r4
.elseif \type == 422
add r6, r6, r4, lsr #1
add r10, r10, r4, lsr #1
.endif
add r0, r0, r1
add r12, r12, r1
bgt 161b
pop {r4-r10,pc}
endfunc
.endm
w_mask_fn 444
w_mask_fn 422
w_mask_fn 420
function blend_8bpc_neon, export=1
push {r4-r5,lr}
ldr r4, [sp, #12]
ldr r5, [sp, #16]
clz lr, r3
adr r3, L(blend_tbl)
sub lr, lr, #26
ldr lr, [r3, lr, lsl #2]
add r3, r3, lr
bx r3
.align 2
L(blend_tbl):
.word 320f - L(blend_tbl) + CONFIG_THUMB
.word 160f - L(blend_tbl) + CONFIG_THUMB
.word 80f - L(blend_tbl) + CONFIG_THUMB
.word 40f - L(blend_tbl) + CONFIG_THUMB
40:
vmov.i8 d22, #64
add r12, r0, r1
lsl r1, r1, #1
4:
vld1.u8 {d2}, [r5]!
vld1.u8 {d1}, [r2]!
vld1.32 {d0[]}, [r0]
subs r4, r4, #2
vld1.32 {d0[1]}, [r12]
vsub.i8 d3, d22, d2
vmull.u8 q8, d1, d2
vmlal.u8 q8, d0, d3
vrshrn.i16 d20, q8, #6
vst1.32 {d20[0]}, [r0], r1
vst1.32 {d20[1]}, [r12], r1
bgt 4b
pop {r4-r5,pc}
80:
vmov.i8 d16, #64
add r12, r0, r1
lsl r1, r1, #1
8:
vld1.u8 {q1}, [r5]!
vld1.u8 {q2}, [r2]!
vld1.u8 {d0}, [r0]
vsub.i8 d17, d16, d2
vld1.u8 {d1}, [r12]
subs r4, r4, #2
vsub.i8 d18, d16, d3
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d17
vmull.u8 q10, d3, d5
vmlal.u8 q10, d1, d18
vrshrn.i16 d22, q3, #6
vrshrn.i16 d23, q10, #6
vst1.u8 {d22}, [r0], r1
vst1.u8 {d23}, [r12], r1
bgt 8b
pop {r4-r5,pc}
160:
vmov.i8 q12, #64
add r12, r0, r1
lsl r1, r1, #1
16:
vld1.u8 {q1, q2}, [r5]!
vld1.u8 {q8, q9}, [r2]!
vld1.u8 {q0}, [r0]
subs r4, r4, #2
vsub.i8 q15, q12, q1
vld1.u8 {q13}, [r12]
vmull.u8 q3, d16, d2
vmlal.u8 q3, d0, d30
vmull.u8 q14, d17, d3
vmlal.u8 q14, d1, d31
vsub.i8 q15, q12, q2
vrshrn.i16 d20, q3, #6
vrshrn.i16 d21, q14, #6
vmull.u8 q3, d18, d4
vmlal.u8 q3, d26, d30
vmull.u8 q14, d19, d5
vmlal.u8 q14, d27, d31
vrshrn.i16 d22, q3, #6
vrshrn.i16 d23, q14, #6
vst1.u8 {q10}, [r0], r1
vst1.u8 {q11}, [r12], r1
bgt 16b
pop {r4-r5,pc}
320:
vmov.i8 q10, #64
32:
vld1.u8 {q2, q3}, [r5]!
vld1.u8 {q8, q9}, [r2]!
vld1.u8 {q0, q1}, [r0]
subs r4, r4, #1
vsub.i8 q11, q10, q2
vmull.u8 q15, d16, d4
vmlal.u8 q15, d0, d22
vmull.u8 q14, d17, d5
vmlal.u8 q14, d1, d23
vsub.i8 q11, q10, q3
vrshrn.i16 d24, q15, #6
vrshrn.i16 d25, q14, #6
vmull.u8 q15, d18, d6
vmlal.u8 q15, d2, d22
vmull.u8 q14, d19, d7
vmlal.u8 q14, d3, d23
vrshrn.i16 d26, q15, #6
vrshrn.i16 d27, q14, #6
vst1.u8 {q12, q13}, [r0], r1
bgt 32b
pop {r4-r5,pc}
endfunc
function blend_h_8bpc_neon, export=1
push {r4-r8,lr}
ldr r4, [sp, #24]
movrel r5, X(obmc_masks)
add r5, r5, r4
sub r4, r4, r4, lsr #2
clz r6, r3
adr r7, L(blend_h_tbl)
sub r6, r6, #24
ldr r6, [r7, r6, lsl #2]
add r7, r7, r6
bx r7
.align 2
L(blend_h_tbl):
.word 1280f - L(blend_h_tbl) + CONFIG_THUMB
.word 640f - L(blend_h_tbl) + CONFIG_THUMB
.word 320f - L(blend_h_tbl) + CONFIG_THUMB
.word 160f - L(blend_h_tbl) + CONFIG_THUMB
.word 80f - L(blend_h_tbl) + CONFIG_THUMB
.word 40f - L(blend_h_tbl) + CONFIG_THUMB
.word 20f - L(blend_h_tbl) + CONFIG_THUMB
20:
vmov.i8 d22, #64
add r12, r0, r1
lsl r1, r1, #1
2:
vld1.16 {d2[], d3[]}, [r5]!
vld1.32 {d1[0]}, [r2]!
subs r4, r4, #2
vld1.16 {d0[]}, [r0]
vzip.8 d2, d3
vsub.i8 d4, d22, d2
vld1.16 {d0[1]}, [r12]
vmull.u8 q8, d1, d2
vmlal.u8 q8, d0, d4
vrshrn.i16 d20, q8, #6
vst1.16 {d20[0]}, [r0], r1
vst1.16 {d20[1]}, [r12], r1
bgt 2b
pop {r4-r8,pc}
40:
vmov.i8 d22, #64
add r12, r0, r1
lsl r1, r1, #1
4:
vld2.u8 {d2[], d3[]}, [r5]!
vld1.u8 {d1}, [r2]!
subs r4, r4, #2
vext.u8 d2, d2, d3, #4
vld1.32 {d0[]}, [r0]
vsub.i8 d6, d22, d2
vld1.32 {d0[1]}, [r12]
vmull.u8 q8, d1, d2
vmlal.u8 q8, d0, d6
vrshrn.i16 d20, q8, #6
vst1.32 {d20[0]}, [r0], r1
vst1.32 {d20[1]}, [r12], r1
bgt 4b
pop {r4-r8,pc}
80:
vmov.i8 q8, #64
add r12, r0, r1
lsl r1, r1, #1
8:
vld2.u8 {d2[], d3[]}, [r5]!
vld1.u8 {d4, d5}, [r2]!
vld1.u8 {d0}, [r0]
vsub.i8 q9, q8, q1
vld1.u8 {d1}, [r12]
subs r4, r4, #2
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d18
vmull.u8 q10, d3, d5
vmlal.u8 q10, d1, d19
vrshrn.i16 d22, q3, #6
vrshrn.i16 d23, q10, #6
vst1.u8 {d22}, [r0], r1
vst1.u8 {d23}, [r12], r1
bgt 8b
pop {r4-r8,pc}
160:
vmov.i8 q12, #64
add r12, r0, r1
lsl r1, r1, #1
16:
vld2.u8 {d28[], d29[]}, [r5]!
vld1.u8 {d2, d3, d4, d5}, [r2]!
vsub.i8 q15, q12, q14
vld1.u8 {q0}, [r0]
subs r4, r4, #2
vld1.u8 {q13}, [r12]
vmull.u8 q3, d2, d28
vmlal.u8 q3, d0, d30
vmull.u8 q8, d3, d28
vmlal.u8 q8, d1, d30
vrshrn.i16 d18, q3, #6
vrshrn.i16 d19, q8, #6
vmull.u8 q3, d4, d29
vmlal.u8 q3, d26, d31
vmull.u8 q8, d5, d29
vmlal.u8 q8, d27, d31
vrshrn.i16 d20, q3, #6
vrshrn.i16 d21, q8, #6
vst1.u8 {q9}, [r0], r1
vst1.u8 {q10}, [r12], r1
bgt 16b
pop {r4-r8,pc}
320:
640:
1280:
vmov.i8 d20, #64
sub r1, r1, r3
321:
vld1.u8 {d6[]}, [r5]!
vsub.i8 d7, d20, d6
mov r8, r3
32:
vld1.u8 {q8, q9}, [r2]!
vld1.u8 {q0, q1}, [r0]
vmull.u8 q15, d16, d6
vmlal.u8 q15, d0, d7
vmull.u8 q14, d17, d6
vmlal.u8 q14, d1, d7
vrshrn.i16 d0, q15, #6
vrshrn.i16 d1, q14, #6
vmull.u8 q15, d18, d6
vmlal.u8 q15, d2, d7
vmull.u8 q14, d19, d6
vmlal.u8 q14, d3, d7
vrshrn.i16 d2, q15, #6
vrshrn.i16 d3, q14, #6
vst1.u8 {q0, q1}, [r0]!
subs r8, r8, #32
bgt 32b
add r0, r0, r1
subs r4, r4, #1
bgt 321b
pop {r4-r8,pc}
endfunc
function blend_v_8bpc_neon, export=1
push {r4-r5,lr}
ldr r4, [sp, #12]
movrel r5, X(obmc_masks)
add r5, r5, r3
clz lr, r3
adr r3, L(blend_v_tbl)
sub lr, lr, #26
ldr lr, [r3, lr, lsl #2]
add r3, r3, lr
bx r3
.align 2
L(blend_v_tbl):
.word 320f - L(blend_v_tbl) + CONFIG_THUMB
.word 160f - L(blend_v_tbl) + CONFIG_THUMB
.word 80f - L(blend_v_tbl) + CONFIG_THUMB
.word 40f - L(blend_v_tbl) + CONFIG_THUMB
.word 20f - L(blend_v_tbl) + CONFIG_THUMB
20:
vmov.i8 d22, #64
vld1.8 {d2[]}, [r5]
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d3, d22, d2
2:
vld1.16 {d1[0]}, [r2]!
vld1.8 {d0[]}, [r0]
subs r4, r4, #2
vld1.8 {d1[1]}, [r2]
vld1.8 {d0[1]}, [r12]
vmull.u8 q2, d1, d2
vmlal.u8 q2, d0, d3
vrshrn.i16 d6, q2, #6
add r2, r2, #2
vst1.8 {d6[0]}, [r0], r1
vst1.8 {d6[1]}, [r12], r1
bgt 2b
pop {r4-r5,pc}
40:
vmov.i8 d22, #64
vld1.32 {d4[]}, [r5]
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d5, d22, d4
sub r1, r1, #3
4:
vld1.u8 {d2}, [r2]!
vld1.32 {d0[]}, [r0]
vld1.32 {d0[1]}, [r12]
subs r4, r4, #2
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d5
vrshrn.i16 d20, q3, #6
vst1.16 {d20[0]}, [r0]!
vst1.16 {d20[2]}, [r12]!
vst1.8 {d20[2]}, [r0]!
vst1.8 {d20[6]}, [r12]!
add r0, r0, r1
add r12, r12, r1
bgt 4b
pop {r4-r5,pc}
80:
vmov.i8 d16, #64
vld1.u8 {d2}, [r5]
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d17, d16, d2
sub r1, r1, #6
8:
vld1.u8 {d4, d5}, [r2]!
vld1.u8 {d0}, [r0]
vld1.u8 {d1}, [r12]
subs r4, r4, #2
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d17
vmull.u8 q10, d2, d5
vmlal.u8 q10, d1, d17
vrshrn.i16 d22, q3, #6
vrshrn.i16 d23, q10, #6
vst1.32 {d22[0]}, [r0]!
vst1.32 {d23[0]}, [r12]!
vst1.16 {d22[2]}, [r0]!
vst1.16 {d23[2]}, [r12]!
add r0, r0, r1
add r12, r12, r1
bgt 8b
pop {r4-r5,pc}
160:
vmov.i8 q12, #64
vld1.u8 {q14}, [r5]
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 q11, q12, q14
sub r1, r1, #12
16:
vld1.u8 {q1, q2}, [r2]!
vld1.u8 {q0}, [r0]
subs r4, r4, #2
vld1.u8 {q13}, [r12]
vmull.u8 q3, d2, d28
vmlal.u8 q3, d0, d22
vmull.u8 q8, d3, d29
vmlal.u8 q8, d1, d23
vrshrn.i16 d18, q3, #6
vrshrn.i16 d19, q8, #6
vmull.u8 q3, d4, d28
vmlal.u8 q3, d26, d22
vmull.u8 q8, d5, d29
vmlal.u8 q8, d27, d23
vrshrn.i16 d20, q3, #6
vrshrn.i16 d21, q8, #6
vst1.u8 {d18}, [r0]!
vst1.u8 {d20}, [r12]!
vst1.32 {d19[0]}, [r0]!
vst1.32 {d21[0]}, [r12]!
add r0, r0, r1
add r12, r12, r1
bgt 16b
pop {r4-r5,pc}
320:
vmov.i8 q10, #64
vld1.u8 {q2, q3}, [r5]
vsub.i8 q11, q10, q2
vsub.i8 q12, q10, q3
32:
vld1.u8 {q8, q9}, [r2]!
vld1.u8 {q0, q1}, [r0]
subs r4, r4, #1
vmull.u8 q15, d16, d4
vmlal.u8 q15, d0, d22
vmull.u8 q14, d17, d5
vmlal.u8 q14, d1, d23
vrshrn.i16 d0, q15, #6
vrshrn.i16 d1, q14, #6
vmull.u8 q15, d18, d6
vmlal.u8 q15, d2, d24
vrshrn.i16 d2, q15, #6
vst1.u8 {d0, d1, d2}, [r0], r1
bgt 32b
pop {r4-r5,pc}
endfunc
// This has the same signature as the put_8tap functions,
// assumes that the caller has loaded the h argument into r5,
// and assumes that r8 is set to (clz(w)-24).
......@@ -1112,7 +1743,7 @@ L(\type\()_8tap_v_tbl):
vmovl_u8 q3, d6, q4, d8, q8, d16, q9, d18
mul_mla_8_2 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8, q9
shift_store_4 \type, \d_strd, q12, d24, d25, q13, d26, d27
b 48b
bgt 48b
0:
vpop {q4}
pop {r4-r11,pc}
......@@ -1145,7 +1776,7 @@ L(\type\()_8tap_v_tbl):
0:
pop {r4-r11,pc}
880: // 8x8, 8x16, 8x32 v
880: // 8x6, 8x8, 8x16, 8x32 v
1680: // 16x8, 16x16, ...
320: // 32x8, 32x16, ...
640:
......@@ -1178,12 +1809,17 @@ L(\type\()_8tap_v_tbl):
mul_mla_8_1 q3, q4, q3, q4, q8, q9, q10, q11, q12, q13, q14
shift_store_8 \type, \d_strd, q3, d6, q4, d8
ble 9f
subs \h, \h, #4
load_reg \sr2, \src, \s_strd, d30, d2, d4, d6
vmovl_u8 q15, d30, q1, d2, q2, d4, q3, d6
subs \h, \h, #2
load_reg \sr2, \src, \s_strd, d30, d2
vmovl_u8 q15, d30, q1, d2
mul_mla_8_1 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1
shift_store_8 \type, \d_strd, q8, d16, q9, d18
ble 9f
subs \h, \h, #2
load_reg \sr2, \src, \s_strd, d4, d6
vmovl_u8 q2, d4, q3, d6
mul_mla_8_1 q10, q11, q10, q11, q12, q13, q14, q15, q1, q2, q3
shift_store_8 \type, \d_strd, q8, d16, q9, d18, q10, d20, q11, d22
shift_store_8 \type, \d_strd, q10, d20, q11, d22
ble 9f
subs \h, \h, #4
load_reg \sr2, \src, \s_strd, d8, d16, d18, d20
......
......@@ -1119,7 +1119,7 @@ L(\type\()_8tap_v):
uxtl_b v18, v19, v20, v21
mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
shift_store_4 \type, \d_strd, v1, v2
b 48b
b.gt 48b
0:
ret
......@@ -1151,7 +1151,7 @@ L(\type\()_8tap_v):
0:
ret
880: // 8x8, 8x16, 8x32 v
880: // 8x6, 8x8, 8x16, 8x32 v
1680: // 16x8, 16x16, ...
320: // 32x8, 32x16, ...
640:
......@@ -1183,12 +1183,17 @@ L(\type\()_8tap_v):
mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_8 \type, \d_strd, v3, v4
b.le 9f
subs \h, \h, #4
load_8b \sr2, \src, \s_strd, v27, v16, v17, v18
uxtl_b v27, v16, v17, v18
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v27, v16
uxtl_b v27, v16
mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
shift_store_8 \type, \d_strd, v1, v2
b.le 9f
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v17, v18
uxtl_b v17, v18
mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
shift_store_8 \type, \d_strd, v1, v2, v3, v4
shift_store_8 \type, \d_strd, v3, v4
b.le 9f
subs \h, \h, #4
load_8b \sr2, \src, \s_strd, v19, v20, v21, v22
......
......@@ -192,7 +192,7 @@ function msac_decode_symbol_adapt4_neon, export=1
sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i])
dup v6.8h, w4 // -rate
sub w3, w3, w3, lsr #5 // count + (count >= 32)
sub w3, w3, w3, lsr #5 // count - (count >= 32)
sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0)
sshl_n v4, v5, v4, v5, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate
add w3, w3, #1 // count + (count < 32)
......@@ -215,6 +215,7 @@ L(renorm):
eor w5, w5, #16 // d = clz(rng) ^ 16
mvn x7, x7 // ~dif
add x7, x7, x3, lsl #48 // ~dif + (v << 48)
L(renorm2):
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
lsl x7, x7, x5 // (~dif + (v << 48)) << d
......@@ -278,3 +279,86 @@ function msac_decode_symbol_adapt16_neon, export=1
decode_update .8h, .16b, 16
b L(renorm)
endfunc
function msac_decode_bool_equi_neon, export=1
ldp w5, w6, [x0, #RNG] // + CNT
sub sp, sp, #48
ldr x7, [x0, #DIF]
bic w4, w5, #0xff // r &= 0xff00
add w4, w4, #8
subs x8, x7, x4, lsl #47 // dif - vw
lsr w4, w4, #1 // v
sub w5, w5, w4 // r - v
cset w15, lo
csel w4, w5, w4, hs // if (ret) v = r - v;
csel x7, x8, x7, hs // if (ret) dif = dif - vw;
clz w5, w4 // clz(rng)
mvn x7, x7 // ~dif
eor w5, w5, #16 // d = clz(rng) ^ 16
b L(renorm2)
endfunc
function msac_decode_bool_neon, export=1
ldp w5, w6, [x0, #RNG] // + CNT
sub sp, sp, #48
ldr x7, [x0, #DIF]
lsr w4, w5, #8 // r >> 8
bic w1, w1, #0x3f // f &= ~63
mul w4, w4, w1
lsr w4, w4, #7
add w4, w4, #4 // v
subs x8, x7, x4, lsl #48 // dif - vw
sub w5, w5, w4 // r - v
cset w15, lo
csel w4, w5, w4, hs // if (ret) v = r - v;
csel x7, x8, x7, hs // if (ret) dif = dif - vw;
clz w5, w4 // clz(rng)
mvn x7, x7 // ~dif
eor w5, w5, #16 // d = clz(rng) ^ 16
b L(renorm2)
endfunc
function msac_decode_bool_adapt_neon, export=1
ldr w9, [x1] // cdf[0-1]
ldp w5, w6, [x0, #RNG] // + CNT
sub sp, sp, #48
ldr x7, [x0, #DIF]
lsr w4, w5, #8 // r >> 8
and w2, w9, #0xffc0 // f &= ~63
mul w4, w4, w2
lsr w4, w4, #7
add w4, w4, #4 // v
subs x8, x7, x4, lsl #48 // dif - vw
sub w5, w5, w4 // r - v
cset w15, lo
csel w4, w5, w4, hs // if (ret) v = r - v;
csel x7, x8, x7, hs // if (ret) dif = dif - vw;
ldr w10, [x0, #ALLOW_UPDATE_CDF]
clz w5, w4 // clz(rng)
mvn x7, x7 // ~dif
eor w5, w5, #16 // d = clz(rng) ^ 16
cbz w10, L(renorm2)
lsr w2, w9, #16 // count = cdf[1]
and w9, w9, #0xffff // cdf[0]
sub w3, w2, w2, lsr #5 // count - (count >= 32)
lsr w2, w2, #4 // count >> 4
add w10, w3, #1 // count + (count < 32)
add w2, w2, #4 // rate = (count >> 4) | 4
sub w9, w9, w15 // cdf[0] -= bit
sub w11, w9, w15, lsl #15 // {cdf[0], cdf[0] - 32769}
asr w11, w11, w2 // {cdf[0], cdf[0] - 32769} >> rate
sub w9, w9, w11 // cdf[0]
strh w9, [x1]
strh w10, [x1, #2]
b L(renorm2)
endfunc
......@@ -59,9 +59,9 @@
#endif
.endm
.macro transpose_8x8b r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().8b, \r0\().8b, \r1\().8b
trn2 \r9\().8b, \r0\().8b, \r1\().8b
.macro transpose_8x8b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
trn1 \t8\().8b, \r0\().8b, \r1\().8b
trn2 \t9\().8b, \r0\().8b, \r1\().8b
trn1 \r1\().8b, \r2\().8b, \r3\().8b
trn2 \r3\().8b, \r2\().8b, \r3\().8b
trn1 \r0\().8b, \r4\().8b, \r5\().8b
......@@ -73,24 +73,53 @@
trn2 \r2\().4h, \r0\().4h, \r2\().4h
trn1 \r6\().4h, \r5\().4h, \r7\().4h
trn2 \r7\().4h, \r5\().4h, \r7\().4h
trn1 \r5\().4h, \r9\().4h, \r3\().4h
trn2 \r9\().4h, \r9\().4h, \r3\().4h
trn1 \r3\().4h, \r8\().4h, \r1\().4h
trn2 \r8\().4h, \r8\().4h, \r1\().4h
trn1 \r5\().4h, \t9\().4h, \r3\().4h
trn2 \t9\().4h, \t9\().4h, \r3\().4h
trn1 \r3\().4h, \t8\().4h, \r1\().4h
trn2 \t8\().4h, \t8\().4h, \r1\().4h
trn1 \r0\().2s, \r3\().2s, \r4\().2s
trn2 \r4\().2s, \r3\().2s, \r4\().2s
trn1 \r1\().2s, \r5\().2s, \r6\().2s
trn2 \r5\().2s, \r5\().2s, \r6\().2s
trn2 \r6\().2s, \r8\().2s, \r2\().2s
trn1 \r2\().2s, \r8\().2s, \r2\().2s
trn1 \r3\().2s, \r9\().2s, \r7\().2s
trn2 \r7\().2s, \r9\().2s, \r7\().2s
trn2 \r6\().2s, \t8\().2s, \r2\().2s
trn1 \r2\().2s, \t8\().2s, \r2\().2s
trn1 \r3\().2s, \t9\().2s, \r7\().2s
trn2 \r7\().2s, \t9\().2s, \r7\().2s
.endm
.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().16b, \r0\().16b, \r1\().16b
trn2 \r9\().16b, \r0\().16b, \r1\().16b
.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
trn1 \t8\().8h, \r0\().8h, \r1\().8h
trn2 \t9\().8h, \r0\().8h, \r1\().8h
trn1 \r1\().8h, \r2\().8h, \r3\().8h
trn2 \r3\().8h, \r2\().8h, \r3\().8h
trn1 \r0\().8h, \r4\().8h, \r5\().8h
trn2 \r5\().8h, \r4\().8h, \r5\().8h
trn1 \r2\().8h, \r6\().8h, \r7\().8h
trn2 \r7\().8h, \r6\().8h, \r7\().8h
trn1 \r4\().4s, \r0\().4s, \r2\().4s
trn2 \r2\().4s, \r0\().4s, \r2\().4s
trn1 \r6\().4s, \r5\().4s, \r7\().4s
trn2 \r7\().4s, \r5\().4s, \r7\().4s
trn1 \r5\().4s, \t9\().4s, \r3\().4s
trn2 \t9\().4s, \t9\().4s, \r3\().4s
trn1 \r3\().4s, \t8\().4s, \r1\().4s
trn2 \t8\().4s, \t8\().4s, \r1\().4s
trn1 \r0\().2d, \r3\().2d, \r4\().2d
trn2 \r4\().2d, \r3\().2d, \r4\().2d
trn1 \r1\().2d, \r5\().2d, \r6\().2d
trn2 \r5\().2d, \r5\().2d, \r6\().2d
trn2 \r6\().2d, \t8\().2d, \r2\().2d
trn1 \r2\().2d, \t8\().2d, \r2\().2d
trn1 \r3\().2d, \t9\().2d, \r7\().2d
trn2 \r7\().2d, \t9\().2d, \r7\().2d
.endm
.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
trn1 \t8\().16b, \r0\().16b, \r1\().16b
trn2 \t9\().16b, \r0\().16b, \r1\().16b
trn1 \r1\().16b, \r2\().16b, \r3\().16b
trn2 \r3\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().16b, \r4\().16b, \r5\().16b
......@@ -102,19 +131,19 @@
trn2 \r2\().8h, \r0\().8h, \r2\().8h
trn1 \r6\().8h, \r5\().8h, \r7\().8h
trn2 \r7\().8h, \r5\().8h, \r7\().8h
trn1 \r5\().8h, \r9\().8h, \r3\().8h
trn2 \r9\().8h, \r9\().8h, \r3\().8h
trn1 \r3\().8h, \r8\().8h, \r1\().8h
trn2 \r8\().8h, \r8\().8h, \r1\().8h
trn1 \r5\().8h, \t9\().8h, \r3\().8h
trn2 \t9\().8h, \t9\().8h, \r3\().8h
trn1 \r3\().8h, \t8\().8h, \r1\().8h
trn2 \t8\().8h, \t8\().8h, \r1\().8h
trn1 \r0\().4s, \r3\().4s, \r4\().4s
trn2 \r4\().4s, \r3\().4s, \r4\().4s
trn1 \r1\().4s, \r5\().4s, \r6\().4s
trn2 \r5\().4s, \r5\().4s, \r6\().4s
trn2 \r6\().4s, \r8\().4s, \r2\().4s
trn1 \r2\().4s, \r8\().4s, \r2\().4s
trn1 \r3\().4s, \r9\().4s, \r7\().4s
trn2 \r7\().4s, \r9\().4s, \r7\().4s
trn2 \r6\().4s, \t8\().4s, \r2\().4s
trn1 \r2\().4s, \t8\().4s, \r2\().4s
trn1 \r3\().4s, \t9\().4s, \r7\().4s
trn2 \r7\().4s, \t9\().4s, \r7\().4s
.endm
.macro transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7
......@@ -129,4 +158,28 @@
trn2 \r3\().8h, \t5\().8h, \t7\().8h
.endm
.macro transpose_4x4h r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().4h, \r0\().4h, \r1\().4h
trn2 \t5\().4h, \r0\().4h, \r1\().4h
trn1 \t6\().4h, \r2\().4h, \r3\().4h
trn2 \t7\().4h, \r2\().4h, \r3\().4h
trn1 \r0\().2s, \t4\().2s, \t6\().2s
trn2 \r2\().2s, \t4\().2s, \t6\().2s
trn1 \r1\().2s, \t5\().2s, \t7\().2s
trn2 \r3\().2s, \t5\().2s, \t7\().2s
.endm
.macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().8h, \r0\().8h, \r1\().8h
trn2 \t5\().8h, \r0\().8h, \r1\().8h
trn1 \t6\().8h, \r2\().8h, \r3\().8h
trn2 \t7\().8h, \r2\().8h, \r3\().8h
trn1 \r0\().4s, \t4\().4s, \t6\().4s
trn2 \r2\().4s, \t4\().4s, \t6\().4s
trn1 \r1\().4s, \t5\().4s, \t7\().4s
trn2 \r3\().4s, \t5\().4s, \t7\().4s
.endm
#endif /* DAV1D_SRC_ARM_64_UTIL_S */
......@@ -37,6 +37,7 @@
.fpu neon
.eabi_attribute 10, 0 // suppress Tag_FP_arch
.eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch
.section .note.GNU-stack,"",%progbits // Mark stack as non-executable
#endif
#ifdef _WIN32
......
......@@ -31,7 +31,9 @@
#include "src/arm/cpu.h"
#if defined(HAVE_GETAUXVAL) && ARCH_ARM
#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
// NEON is always available; runtime tests are not needed.
#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
#include <sys/auxv.h>
#ifndef HWCAP_ARM_NEON
......@@ -77,9 +79,7 @@ static unsigned parse_proc_cpuinfo(const char *flag) {
COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = 0;
#if ARCH_AARCH64
flags |= DAV1D_ARM_CPU_FLAG_NEON;
#elif defined(__ARM_NEON)
#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
flags |= DAV1D_ARM_CPU_FLAG_NEON;
#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
unsigned long hw_cap = getauxval(AT_HWCAP);
......@@ -90,10 +90,6 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
#elif defined(__ANDROID__)
flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
#elif defined(__APPLE__)
flags |= DAV1D_ARM_CPU_FLAG_NEON;
#elif defined(_WIN32)
flags |= DAV1D_ARM_CPU_FLAG_NEON;
#endif
return flags;
......
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
#include "src/itx.h"
#define decl_itx2_fns(w, h, opt) \
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_identity_identity_##w##x##h##_##opt)
#define decl_itx12_fns(w, h, opt) \
decl_itx2_fns(w, h, opt); \
decl_itx_fn(dav1d_inv_txfm_add_dct_adst_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_dct_flipadst_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_dct_identity_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_adst_dct_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_adst_adst_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_adst_flipadst_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_flipadst_dct_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_flipadst_adst_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_identity_dct_##w##x##h##_##opt)
#define decl_itx16_fns(w, h, opt) \
decl_itx12_fns(w, h, opt); \
decl_itx_fn(dav1d_inv_txfm_add_adst_identity_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_flipadst_identity_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_identity_adst_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_identity_flipadst_##w##x##h##_##opt)
#define decl_itx17_fns(w, h, opt) \
decl_itx16_fns(w, h, opt); \
decl_itx_fn(dav1d_inv_txfm_add_wht_wht_##w##x##h##_##opt)
decl_itx17_fns( 4, 4, neon);
decl_itx16_fns( 4, 8, neon);
decl_itx16_fns( 4, 16, neon);
decl_itx16_fns( 8, 4, neon);
decl_itx16_fns( 8, 8, neon);
decl_itx16_fns( 8, 16, neon);
decl_itx2_fns ( 8, 32, neon);
decl_itx16_fns(16, 4, neon);
decl_itx16_fns(16, 8, neon);
decl_itx12_fns(16, 16, neon);
decl_itx2_fns (16, 32, neon);
decl_itx2_fns (32, 8, neon);
decl_itx2_fns (32, 16, neon);
decl_itx2_fns (32, 32, neon);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_neon);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_neon);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_neon);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_neon);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_neon);
COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
dav1d_inv_txfm_add_##type##_##w##x##h##_##ext
#define assign_itx1_fn(pfx, w, h, ext) \
assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
#define assign_itx2_fn(pfx, w, h, ext) \
assign_itx1_fn(pfx, w, h, ext); \
assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
#define assign_itx12_fn(pfx, w, h, ext) \
assign_itx2_fn(pfx, w, h, ext); \
assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
#define assign_itx16_fn(pfx, w, h, ext) \
assign_itx12_fn(pfx, w, h, ext); \
assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
#define assign_itx17_fn(pfx, w, h, ext) \
assign_itx16_fn(pfx, w, h, ext); \
assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8 && ARCH_AARCH64
assign_itx17_fn( , 4, 4, neon);
assign_itx16_fn(R, 4, 8, neon);
assign_itx16_fn(R, 4, 16, neon);
assign_itx16_fn(R, 8, 4, neon);
assign_itx16_fn( , 8, 8, neon);
assign_itx16_fn(R, 8, 16, neon);
assign_itx2_fn (R, 8, 32, neon);
assign_itx16_fn(R, 16, 4, neon);
assign_itx16_fn(R, 16, 8, neon);
assign_itx12_fn( , 16, 16, neon);
assign_itx2_fn (R, 16, 32, neon);
assign_itx1_fn (R, 16, 64, neon);
assign_itx2_fn (R, 32, 8, neon);
assign_itx2_fn (R, 32, 16, neon);
assign_itx2_fn ( , 32, 32, neon);
assign_itx1_fn (R, 32, 64, neon);
assign_itx1_fn (R, 64, 16, neon);
assign_itx1_fn (R, 64, 32, neon);
assign_itx1_fn ( , 64, 64, neon);
#endif
}