...
 
Commits (18)
Changes for 0.2.1 'Antelope':
----------------------------
- SSSE3 optimization for cdef_dir
- AVX-2 improvements of the existing CDEF optimizations
- NEON improvements of the existing CDEF and wiener optimizations
- Clarification about the numbering/versioning scheme
Changes for 0.2.0 'Antelope':
----------------------------
......
......@@ -16,4 +16,4 @@ The Alliance for Open Media (AOM) for funding this project.
And all the dav1d Authors (git shortlog -sn), including:
Janne Grunau, Ronald S. Bultje, James Almer, Marvin Scholz, Henrik Gramner, Martin Storsjö, Luc Trudeau, David Michael Barr, Hugo Beauzée-Luyssen, Steve Lhomme, Jean-Baptiste Kempf, Derek Buitenhuis, Nathan E. Egge, Raphaël Zumer, Francois Cartegnie, Niklas Haas, Konstantin Pavlov, Boyuan Xiao, Raphael Zumer and Michael Bradshaw.
Janne Grunau, Ronald S. Bultje, Martin Storsjö, James Almer, Henrik Gramner, Marvin Scholz, Luc Trudeau, David Michael Barr, Jean-Baptiste Kempf, Hugo Beauzée-Luyssen, Steve Lhomme, Francois Cartegnie, Konstantin Pavlov, Nathan E. Egge, Victorien Le Couviour--Tuffet, Derek Buitenhuis, Liwei Wang, Raphaël Zumer, Michael Bradshaw, Niklas Haas, Xuefeng Jiang, Boyuan Xiao, Kyle Siefring, Matthias Dressel, Rupert Swarbrick, Thierry Foucu, Thomas Daede, Jan Beich, SmilingWolf, Tristan Laurent, Vittorio Giovara, Anisse Astier, Dale Curtis, Fred Barbier, Jean-Yves Avenard, Luca Barbato, Mark Shuttleworth, Nicolas Frattaroli, Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Timo Gurr and skal.
......@@ -73,16 +73,6 @@ typedef struct Dav1dSettings {
*/
DAV1D_API const char *dav1d_version(void);
/**
* Get library version based on version control system.
*/
DAV1D_API const char *dav1d_version_vcs(void);
/**
* Get library version as unsigned int.
*/
DAV1D_API unsigned int dav1d_version_int(void);
/**
* Initialize settings to default values.
*
......
......@@ -24,9 +24,9 @@
# installed version.h header generation
version_h_data = configuration_data()
version_h_data.set('DAV1D_VERSION_MAJOR', dav1d_version_major)
version_h_data.set('DAV1D_VERSION_MINOR', dav1d_version_minor)
version_h_data.set('DAV1D_VERSION_PATCH', dav1d_version_revision)
version_h_data.set('DAV1D_API_VERSION_MAJOR', dav1d_api_version_major)
version_h_data.set('DAV1D_API_VERSION_MINOR', dav1d_api_version_minor)
version_h_data.set('DAV1D_API_VERSION_PATCH', dav1d_api_version_revision)
version_h_target = configure_file(input: 'version.h.in',
output: 'version.h',
configuration: version_h_data)
......
......@@ -27,12 +27,8 @@
#ifndef DAV1D_VERSION_H
#define DAV1D_VERSION_H
#define DAV1D_VERSION_MAJOR @DAV1D_VERSION_MAJOR@
#define DAV1D_VERSION_MINOR @DAV1D_VERSION_MINOR@
#define DAV1D_VERSION_PATCH @DAV1D_VERSION_PATCH@
#define DAV1D_VERSION "@DAV1D_VERSION_MAJOR@.@DAV1D_VERSION_MINOR@.@DAV1D_VERSION_PATCH@"
#define DAV1D_VERSION_INT (@DAV1D_VERSION_MAJOR@ << 16 | @DAV1D_VERSION_MINOR@ << 8 | @DAV1D_VERSION_PATCH@)
#define DAV1D_API_VERSION_MAJOR @DAV1D_API_VERSION_MAJOR@
#define DAV1D_API_VERSION_MINOR @DAV1D_API_VERSION_MINOR@
#define DAV1D_API_VERSION_PATCH @DAV1D_API_VERSION_PATCH@
#endif /* DAV1D_VERSION_H */
/* auto-generated, do not edit */
#define DAV1D_VERSION_VCS "@VCS_TAG@"
#define DAV1D_VERSION "@VCS_TAG@"
......@@ -23,18 +23,18 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
version: '0.2.0',
version: '0.2.1',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
'b_ndebug=if-release'],
meson_version: '>= 0.47.0')
dav1d_soname_version = '1.0.0'
dav1d_version_array = dav1d_soname_version.split('.')
dav1d_version_major = dav1d_version_array[0]
dav1d_version_minor = dav1d_version_array[1]
dav1d_version_revision = dav1d_version_array[2]
dav1d_soname_version = '1.0.1'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
dav1d_api_version_revision = dav1d_api_version_array[2]
dav1d_src_root = meson.current_source_dir()
cc = meson.get_compiler('c')
......
......@@ -283,14 +283,12 @@ L(variable_shift_tbl):
.word 66f - L(variable_shift_tbl) + CONFIG_THUMB
.word 77f - L(variable_shift_tbl) + CONFIG_THUMB
44: // 4 pixels valid in d2/d16, fill d3/d17 with padding.
vmov d3, d4
vmov d17, d18
b 88f
// Shift q1 right, shifting out invalid pixels,
// shift q1 left to the original offset, shifting in padding pixels.
44: // 4 pixels valid
vext.8 q1, q1, q1, #8
vext.8 q1, q1, q2, #8
vext.8 q8, q8, q8, #8
vext.8 q8, q8, q9, #8
b 88f
55: // 5 pixels valid
vext.8 q1, q1, q1, #10
vext.8 q1, q1, q2, #6
......
......@@ -136,8 +136,7 @@
.macro padding_func w, stride, rn, rw
function cdef_padding\w\()_neon, export=1
movi v30.16b, #255
ushr v30.8h, v30.8h, #1 // INT16_MAX
movi v30.8h, #0x80, lsl #8
mov v31.16b, v30.16b
sub x0, x0, #2*(2*\stride+2)
tst w6, #4 // CDEF_HAVE_TOP
......@@ -290,29 +289,23 @@ endconst
.endif
.endm
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
cmeq v16.8h, \s1\().8h, v31.8h
cmeq v17.8h, \s2\().8h, v31.8h
bic v16.16b, \s1\().16b, v16.16b
bic v17.16b, \s2\().16b, v17.16b
umin v2.8h, v2.8h, \s1\().8h
umax v3.8h, v3.8h, v16.8h
smax v3.8h, v3.8h, \s1\().8h
umin v2.8h, v2.8h, \s2\().8h
umax v3.8h, v3.8h, v17.8h
smax v3.8h, v3.8h, \s2\().8h
cbz \threshold, 3f
uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
sub v17.8h, \thresh_vec, v17.8h // threshold - (abs(diff) >> shift)
sub v21.8h, \thresh_vec, v21.8h // threshold - (abs(diff) >> shift)
smax v17.8h, v29.8h, v17.8h // imax(0, threshold - ())
smax v21.8h, v29.8h, v21.8h // imax(0, threshold - ())
uqsub v17.8h, \thresh_vec, v17.8h // imax(0, threshold - (abs(diff) >> shift))
uqsub v21.8h, \thresh_vec, v21.8h // imax(0, threshold - (abs(diff) >> shift))
cmhi v18.8h, v0.8h, \s1\().8h // px > p0
cmhi v22.8h, v0.8h, \s2\().8h // px > p1
smin v17.8h, v17.8h, v16.8h // imin(abs(diff), imax())
smin v21.8h, v21.8h, v20.8h // imin(abs(diff), imax())
dup v19.8h, \tap // taps[k]/taps[k]
umin v17.8h, v17.8h, v16.8h // imin(abs(diff), imax())
umin v21.8h, v21.8h, v20.8h // imin(abs(diff), imax())
dup v19.8h, \tap // taps[k]
neg v16.8h, v17.8h // -imin()
neg v20.8h, v21.8h // -imin()
bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign()
......@@ -332,11 +325,8 @@ function cdef_filter\w\()_neon, export=1
add x8, x8, w9, uxtw #1
movrel x9, directions\w
add x5, x9, w5, uxtw #1
movi v31.16b, #255
movi v30.8h, #15
movi v29.8h, #0
dup v28.8h, w6 // damping
ushr v31.8h, v31.8h, #1 // INT16_MAX
dup v25.8h, w3 // threshold
dup v27.8h, w4 // threshold
......@@ -344,10 +334,8 @@ function cdef_filter\w\()_neon, export=1
clz v26.8h, v27.8h // clz(threshold)
sub v24.8h, v30.8h, v24.8h // ulog2(threshold)
sub v26.8h, v30.8h, v26.8h // ulog2(threshold)
sub v24.8h, v28.8h, v24.8h // damping - ulog2(threshold)
sub v26.8h, v28.8h, v26.8h // damping - ulog2(threshold)
smax v24.8h, v29.8h, v24.8h // shift = imax(0, damping - ulog2(threshold))
smax v26.8h, v29.8h, v26.8h // shift = imax(0, damping - ulog2(threshold))
uqsub v24.8h, v28.8h, v24.8h // shift = imax(0, damping - ulog2(threshold))
uqsub v26.8h, v28.8h, v26.8h // shift = imax(0, damping - ulog2(threshold))
neg v24.8h, v24.8h // -shift
neg v26.8h, v26.8h // -shift
......
......@@ -224,31 +224,25 @@ function wiener_filter_h_neon, export=1
mov v3.16b, v28.16b
mov v5.16b, v29.16b
br x11
44: // 4 pixels valid in v2/v4, fill the high half with padding.
ins v2.d[1], v3.d[0]
ins v4.d[1], v5.d[0]
b 88f
// Shift v2 right, shifting out invalid pixels,
// shift v2 left to the original offset, shifting in padding pixels.
44: // 4 pixels valid
ext v2.16b, v2.16b, v2.16b, #8
ext v2.16b, v2.16b, v3.16b, #8
ext v4.16b, v4.16b, v4.16b, #8
ext v4.16b, v4.16b, v5.16b, #8
b 88f
55: // 5 pixels valid
ext v2.16b, v2.16b, v2.16b, #10
ext v2.16b, v2.16b, v3.16b, #6
ext v4.16b, v4.16b, v4.16b, #10
ext v4.16b, v4.16b, v5.16b, #6
b 88f
66: // 6 pixels valid
ext v2.16b, v2.16b, v2.16b, #12
ext v2.16b, v2.16b, v3.16b, #4
ext v4.16b, v4.16b, v4.16b, #12
ext v4.16b, v4.16b, v5.16b, #4
66: // 6 pixels valid, fill the upper 2 pixels with padding.
ins v2.s[3], v3.s[0]
ins v4.s[3], v5.s[0]
b 88f
77: // 7 pixels valid
ext v2.16b, v2.16b, v2.16b, #14
ext v2.16b, v2.16b, v3.16b, #2
ext v4.16b, v4.16b, v4.16b, #14
ext v4.16b, v4.16b, v5.16b, #2
77: // 7 pixels valid, fill the last pixel with padding.
ins v2.h[7], v3.h[0]
ins v4.h[7], v5.h[0]
b 88f
L(variable_shift_tbl):
......
......@@ -29,8 +29,6 @@
#include "src/looprestoration.h"
#include "common/attributes.h"
#include "common/intops.h"
#include "src/tables.h"
#if BITDEPTH == 8
// This calculates things slightly differently than the reference C version.
......
#define VERSION_NUMBER @VERSION_MAJOR@,@VERSION_MINOR@,@VERSION_REVISION@,@VERSION_EXTRA@
#define VERSION_NUMBER_STR "@VERSION_MAJOR@.@VERSION_MINOR@.@VERSION_REVISION@"
#define API_VERSION_NUMBER @API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0
#define API_VERSION_NUMBER_STR "@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@"
#define PROJECT_VERSION_NUMBER @PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0
#define PROJECT_VERSION_NUMBER_STR "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@"
#include <windows.h>
1 VERSIONINFO
FILETYPE VFT_DLL
FILEOS VOS_NT_WINDOWS32
PRODUCTVERSION VERSION_NUMBER
FILEVERSION VERSION_NUMBER
PRODUCTVERSION PROJECT_VERSION_NUMBER
FILEVERSION API_VERSION_NUMBER
BEGIN
BLOCK "StringFileInfo"
BEGIN
......@@ -15,9 +17,9 @@ BEGIN
BEGIN
VALUE "CompanyName", "VideoLAN"
VALUE "ProductName", "dav1d"
VALUE "ProductVersion", VERSION_NUMBER_STR
VALUE "FileVersion", VERSION_NUMBER_STR
VALUE "FileDescription", "dav1d AV1 decoder"
VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR
VALUE "FileVersion", API_VERSION_NUMBER_STR
VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder"
VALUE "InternalName", "dav1d"
VALUE "OriginalFilename", "libdav1d.dll"
VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors"
......
......@@ -56,14 +56,6 @@ const char *dav1d_version(void) {
return DAV1D_VERSION;
}
const char *dav1d_version_vcs(void) {
return DAV1D_VERSION_VCS;
}
unsigned int dav1d_version_int(void) {
return DAV1D_VERSION_INT;
}
void dav1d_default_settings(Dav1dSettings *const s) {
s->n_frame_threads = 1;
s->n_tile_threads = 1;
......
......@@ -147,10 +147,12 @@ if host_machine.system() == 'windows' and get_option('default_library') != 'stat
rc_version_array = meson.project_version().split('.')
winmod = import('windows')
rc_data = configuration_data()
rc_data.set('VERSION_MAJOR', rc_version_array[0])
rc_data.set('VERSION_MINOR', rc_version_array[1])
rc_data.set('VERSION_REVISION', rc_version_array[2])
rc_data.set('VERSION_EXTRA', '0')
rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0])
rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1])
rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2])
rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
rc_data.set('COPYRIGHT_YEARS', '2019')
rc_file = configure_file(
......@@ -201,7 +203,7 @@ endforeach
if host_machine.system() == 'windows'
dav1d_soversion = ''
else
dav1d_soversion = dav1d_version_major
dav1d_soversion = dav1d_api_version_major
endif
libdav1d = library('dav1d',
......
......@@ -33,10 +33,13 @@ pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6
div_table: dd 840, 420, 280, 210, 168, 140, 120, 105
dd 420, 210, 140, 105
shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pw_128: times 2 dw 128
pw_2048: times 2 dw 2048
tap_table: dw 4, 2, 3, 3, 2, 1
tap_table: ; masks for 8 bit shifts
db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
; weights
db 4, 2, 3, 3, 2, 1
db -1 * 16 + 1, -2 * 16 + 2
db 0 * 16 + 1, -1 * 16 + 2
db 0 * 16 + 1, 0 * 16 + 2
......@@ -55,56 +58,59 @@ tap_table: dw 4, 2, 3, 3, 2, 1
SECTION .text
%macro ACCUMULATE_TAP 6 ; tap_offset, shift, strength, mul_tap, w, stride
%macro ACCUMULATE_TAP 7 ; tap_offset, shift, mask, strength, mul_tap, w, stride
; load p0/p1
movsx offq, byte [dirq+kq+%1] ; off1
%if %5 == 4
movq xm5, [stkq+offq*2+%6*0] ; p0
movq xm6, [stkq+offq*2+%6*2]
movhps xm5, [stkq+offq*2+%6*1]
movhps xm6, [stkq+offq*2+%6*3]
%if %6 == 4
movq xm5, [stkq+offq*2+%7*0] ; p0
movq xm6, [stkq+offq*2+%7*2]
movhps xm5, [stkq+offq*2+%7*1]
movhps xm6, [stkq+offq*2+%7*3]
vinserti128 m5, xm6, 1
%else
movu xm5, [stkq+offq*2+%6*0] ; p0
vinserti128 m5, [stkq+offq*2+%6*1], 1
movu xm5, [stkq+offq*2+%7*0] ; p0
vinserti128 m5, [stkq+offq*2+%7*1], 1
%endif
neg offq ; -off1
%if %5 == 4
movq xm6, [stkq+offq*2+%6*0] ; p1
movq xm9, [stkq+offq*2+%6*2]
movhps xm6, [stkq+offq*2+%6*1]
movhps xm9, [stkq+offq*2+%6*3]
%if %6 == 4
movq xm6, [stkq+offq*2+%7*0] ; p1
movq xm9, [stkq+offq*2+%7*2]
movhps xm6, [stkq+offq*2+%7*1]
movhps xm9, [stkq+offq*2+%7*3]
vinserti128 m6, xm9, 1
%else
movu xm6, [stkq+offq*2+%6*0] ; p1
vinserti128 m6, [stkq+offq*2+%6*1], 1
movu xm6, [stkq+offq*2+%7*0] ; p1
vinserti128 m6, [stkq+offq*2+%7*1], 1
%endif
pcmpeqw m9, m14, m5
pcmpeqw m10, m14, m6
pandn m9, m5
pandn m10, m6
pmaxsw m7, m9 ; max after p0
pminsw m8, m5 ; min after p0
pmaxsw m7, m10 ; max after p1
pminsw m8, m6 ; min after p1
; out of bounds values are set to a value that is both a large unsigned
; value and a negative signed value.
; use signed max and unsigned min to remove them
pmaxsw m7, m5 ; max after p0
pminuw m8, m5 ; min after p0
pmaxsw m7, m6 ; max after p1
pminuw m8, m6 ; min after p1
; accumulate sum[m15] over p0/p1
; calculate difference before converting
psubw m5, m4 ; diff_p0(p0 - px)
psubw m6, m4 ; diff_p1(p1 - px)
pabsw m9, m5
pabsw m10, m6
psignw m11, %4, m5
psignw m12, %4, m6
psrlw m5, m9, %2
psrlw m6, m10, %2
psubusw m5, %3, m5
psubusw m6, %3, m6
pminsw m5, m9 ; constrain(diff_p0)
pminsw m6, m10 ; constrain(diff_p1)
pmullw m5, m11 ; constrain(diff_p0) * taps
pmullw m6, m12 ; constrain(diff_p1) * taps
; convert to 8-bits with signed saturation
; saturating to large diffs has no impact on the results
packsswb m5, m6
; group into pairs so we can accumulate using maddubsw
pshufb m5, m12
pabsb m9, m5
psignb m10, %5, m5
psrlw m5, m9, %2 ; emulate 8-bit shift
pand m5, %3
psubusb m5, %4, m5
; use unsigned min since abs diff can equal 0x80
pminub m5, m9
pmaddubsw m5, m10
paddw m15, m5
paddw m15, m6
%endmacro
%macro cdef_filter_fn 3 ; w, h, stride
......@@ -118,7 +124,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
%endif
%define px rsp+2*16+2*%3
pcmpeqw m14, m14
psrlw m14, 1 ; 0x7fff
psllw m14, 15 ; 0x8000
mov edged, r8m
; prepare pixel buffers - body/right
......@@ -358,6 +364,9 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
INIT_YMM avx2
DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, stride3, secdmp
%undef edged
; register to shuffle values into after packing
vbroadcasti128 m12, [shufb_lohi]
movifnidn prid, prim
movifnidn secd, secm
mov dampingd, r7m
......@@ -378,21 +387,25 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
mov [rsp+0], pridmpq ; pri_shift
mov [rsp+8], secdmpq ; sec_shift
DEFINE_ARGS dst, stride, pridmp, table, pri, sec, stride3, secdmp
lea tableq, [tap_table]
vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
; pri/sec_taps[k] [4 total]
DEFINE_ARGS dst, stride, tap, dummy, pri, sec, stride3
DEFINE_ARGS dst, stride, dummy, table, pri, sec, stride3
movd xm0, prid
movd xm1, secd
vpbroadcastw m0, xm0 ; pri_strength
vpbroadcastw m1, xm1 ; sec_strength
vpbroadcastb m0, xm0 ; pri_strength
vpbroadcastb m1, xm1 ; sec_strength
and prid, 1
lea tapq, [tap_table]
lea priq, [tapq+priq*4] ; pri_taps
lea secq, [tapq+8] ; sec_taps
lea priq, [tableq+priq*2+8] ; pri_taps
lea secq, [tableq+12] ; sec_taps
; off1/2/3[k] [6 total] from [tapq+14+(dir+0/2/6)*2+k]
DEFINE_ARGS dst, stride, tap, dir, pri, sec, stride3
DEFINE_ARGS dst, stride, dir, tap, pri, sec, stride3
mov dird, r6m
lea tapq, [tapq+dirq*2+12]
lea dirq, [tapq+dirq*2+14]
%if %1*%2*2/mmsize > 1
%if %1 == 4
DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
......@@ -404,7 +417,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k
%endif
lea stkq, [px]
pxor m13, m13
pxor m11, m11
%if %1*%2*2/mmsize > 1
.v_loop:
%endif
......@@ -423,20 +436,20 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
mova m7, m4 ; max
mova m8, m4 ; min
.k_loop:
vpbroadcastw m2, [priq+kq*2] ; pri_taps
vpbroadcastw m3, [secq+kq*2] ; sec_taps
vpbroadcastb m2, [priq+kq] ; pri_taps
vpbroadcastb m3, [secq+kq] ; sec_taps
ACCUMULATE_TAP 0*2, [rsp+0], m0, m2, %1, %3
ACCUMULATE_TAP 2*2, [rsp+8], m1, m3, %1, %3
ACCUMULATE_TAP 6*2, [rsp+8], m1, m3, %1, %3
ACCUMULATE_TAP 0*2, [rsp+0], m13, m0, m2, %1, %3
ACCUMULATE_TAP 2*2, [rsp+8], m14, m1, m3, %1, %3
ACCUMULATE_TAP 6*2, [rsp+8], m14, m1, m3, %1, %3
dec kq
jge .k_loop
vpbroadcastd m12, [pw_2048]
pcmpgtw m11, m13, m15
paddw m15, m11
pmulhrsw m15, m12
vpbroadcastd m10, [pw_2048]
pcmpgtw m9, m11, m15
paddw m15, m9
pmulhrsw m15, m10
paddw m4, m15
pminsw m4, m7
pmaxsw m4, m8
......@@ -586,9 +599,8 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
; and [upper half]:
; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd
; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd
vbroadcasti128 m14, [shufw_210xxxxx]
pslldq m4, m11, 2
psrldq m11, 14
pslldq m5, m12, 4
......@@ -602,7 +614,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
paddw m11, m13 ; partial_sum_alt[3/2] right
vbroadcasti128 m13, [div_table+32]
paddw m4, m5 ; partial_sum_alt[3/2] left
pshufb m11, m14
pshuflw m11, m11, q3012
punpckhwd m6, m4, m11
punpcklwd m4, m11
pmaddwd m6, m6
......@@ -617,7 +629,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
; and [upper half]:
; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd
; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd
pslldq m5, m1, 2
psrldq m1, 14
......@@ -630,7 +642,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
paddw m6, m7
paddw m1, m3 ; partial_sum_alt[0/1] right
paddw m5, m6 ; partial_sum_alt[0/1] left
pshufb m1, m14
pshuflw m1, m1, q3012
punpckhwd m6, m5, m1
punpcklwd m5, m1
pmaddwd m6, m6
......
......@@ -38,6 +38,7 @@ decl_cdef_fn(dav1d_cdef_filter_4x4_avx2);
decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
decl_cdef_dir_fn(dav1d_cdef_dir_ssse3);
void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
......@@ -45,6 +46,7 @@ void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH ==8
c->dir = dav1d_cdef_dir_ssse3;
c->fb[0] = dav1d_cdef_filter_8x8_ssse3;
c->fb[1] = dav1d_cdef_filter_4x8_ssse3;
c->fb[2] = dav1d_cdef_filter_4x4_ssse3;
......
......@@ -29,10 +29,17 @@
SECTION_RODATA 16
%if ARCH_X86_32
pb_0: times 16 db 0
%endif
pw_128: times 8 dw 128
pw_256: times 8 dw 256
pw_2048: times 8 dw 2048
pw_0x7FFF: times 8 dw 0x7FFF
pd_0to7: dd 0, 4, 2, 6, 1, 5, 3, 7
div_table: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 120, 105, 105
dw 420, 420, 210, 210, 140, 140, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105
shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
tap_table: dw 4, 2, 3, 3, 2, 1
db -1 * 16 + 1, -2 * 16 + 2
db 0 * 16 + 1, -1 * 16 + 2
......@@ -711,3 +718,589 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
cdef_filter_fn 8, 8, 32
cdef_filter_fn 4, 8, 32
cdef_filter_fn 4, 4, 32
; MULLD dst, src
; Builds a 32-bit low multiply out of 16-bit multiplies.
; Per dword lane, viewing dst as words (a_hi:a_lo) and src as (b_hi:b_lo):
;   dst = a_lo*b_lo + ((a_hi*b_hi) << 16)   (mod 2^32)
; This equals dst*b (mod 2^32) when both words of src hold the same
; unsigned 16-bit value b, which is how the div_table rows passed by
; cdef_dir appear to be laid out (each entry stored twice).
; Clobbers m15. On x86-32, m15 is aliased to m1, so m1 is clobbered instead.
%macro MULLD 2
%if ARCH_X86_32
%define m15 m1
%endif
; high 16 bits of each unsigned word product; must run before %1 is overwritten
pmulhuw m15, %1, %2
; low 16 bits of each word product, written in place
pmullw %1, %2
; move the carry out of the low-word product into the high word of the lane
pslld m15, 16
paddd %1, m15
%endmacro
;-----------------------------------------------------------------------
; cdef_dir(src, stride, var)
; Reads an 8x8 block of bytes (8 rows of 8 bytes, rows strideq apart),
; subtracts 128 from every pixel, accumulates partial sums along the
; horizontal/vertical, diagonal and "alt" directions, and turns them into
; eight costs (sums of squares scaled by div_table entries via MULLD).
; Out: eax    = direction index of the largest cost (tracked via pd_0to7)
;      [varq] = (largest cost - cost at index^4) >> 10
; The x86-64 body works in m0-m15. The x86-32 body works in m0-m7 and
; spills rows and intermediate costs to [esp+0x00..0x5f].
;-----------------------------------------------------------------------
%if ARCH_X86_64
cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
lea stride3q, [strideq*3]
; load 8 rows of 8 pixels, two rows per register
movq m1, [srcq+strideq*0]
movhps m1, [srcq+strideq*1]
movq m3, [srcq+strideq*2]
movhps m3, [srcq+stride3q]
lea srcq, [srcq+strideq*4]
movq m5, [srcq+strideq*0]
movhps m5, [srcq+strideq*1]
movq m7, [srcq+strideq*2]
movhps m7, [srcq+stride3q]
; per-row byte sums (sum of absolute differences against zero)
pxor m8, m8
psadbw m0, m1, m8
psadbw m2, m3, m8
psadbw m4, m5, m8
psadbw m6, m7, m8
packssdw m0, m2
packssdw m4, m6
packssdw m0, m4
SWAP m0, m9
; widen the pixels to words, one row per register (m0-m7)
punpcklbw m0, m1, m8
punpckhbw m1, m8
punpcklbw m2, m3, m8
punpckhbw m3, m8
punpcklbw m4, m5, m8
punpckhbw m5, m8
punpcklbw m6, m7, m8
punpckhbw m7, m8
; remove the 128 bias from every pixel
mova m8, [pw_128]
psubw m0, m8
psubw m1, m8
psubw m2, m8
psubw m3, m8
psubw m4, m8
psubw m5, m8
psubw m6, m8
psubw m7, m8
; m8 = 128 << 3: the bias accumulated by summing 8 unbiased pixels
psllw m8, 3
psubw m9, m8 ; partial_sum_hv[0]
paddw m8, m0, m1
paddw m10, m2, m3
paddw m8, m4
paddw m10, m5
paddw m8, m6
paddw m10, m7
paddw m8, m10 ; partial_sum_hv[1]
pmaddwd m8, m8
pmaddwd m9, m9
phaddd m9, m8
SWAP m8, m9
MULLD m8, [div_table+48]
pslldq m9, m1, 2
psrldq m10, m1, 14
pslldq m11, m2, 4
psrldq m12, m2, 12
pslldq m13, m3, 6
psrldq m14, m3, 10
paddw m9, m0
paddw m10, m12
paddw m11, m13
paddw m10, m14 ; partial_sum_diag[0] top/right half
paddw m9, m11 ; partial_sum_diag[0] top/left half
pslldq m11, m4, 8
psrldq m12, m4, 8
pslldq m13, m5, 10
psrldq m14, m5, 6
paddw m9, m11
paddw m10, m12
paddw m9, m13
paddw m10, m14
pslldq m11, m6, 12
psrldq m12, m6, 4
pslldq m13, m7, 14
psrldq m14, m7, 2
paddw m9, m11
paddw m10, m12
paddw m9, m13 ; partial_sum_diag[0][0-7]
paddw m10, m14 ; partial_sum_diag[0][8-14,zero]
pshufb m10, [shufw_6543210x]
punpckhwd m11, m9, m10
punpcklwd m9, m10
pmaddwd m11, m11
pmaddwd m9, m9
MULLD m11, [div_table+16]
MULLD m9, [div_table+0]
paddd m9, m11 ; cost[0a-d]
pslldq m10, m0, 14
psrldq m11, m0, 2
pslldq m12, m1, 12
psrldq m13, m1, 4
pslldq m14, m2, 10
psrldq m15, m2, 6
paddw m10, m12
paddw m11, m13
paddw m10, m14
paddw m11, m15
pslldq m12, m3, 8
psrldq m13, m3, 8
pslldq m14, m4, 6
psrldq m15, m4, 10
paddw m10, m12
paddw m11, m13
paddw m10, m14
paddw m11, m15
pslldq m12, m5, 4
psrldq m13, m5, 12
pslldq m14, m6, 2
psrldq m15, m6, 14
paddw m10, m12
paddw m11, m13
paddw m10, m14
paddw m11, m15 ; partial_sum_diag[1][8-14,zero]
paddw m10, m7 ; partial_sum_diag[1][0-7]
pshufb m11, [shufw_6543210x]
punpckhwd m12, m10, m11
punpcklwd m10, m11
pmaddwd m12, m12
pmaddwd m10, m10
MULLD m12, [div_table+16]
MULLD m10, [div_table+0]
paddd m10, m12 ; cost[4a-d]
phaddd m9, m10 ; cost[0a/b,4a/b]
paddw m10, m0, m1
paddw m11, m2, m3
paddw m12, m4, m5
paddw m13, m6, m7
phaddw m0, m4
phaddw m1, m5
phaddw m2, m6
phaddw m3, m7
; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1)
pslldq m4, m11, 2
psrldq m5, m11, 14
pslldq m6, m12, 4
psrldq m7, m12, 12
pslldq m14, m13, 6
psrldq m15, m13, 10
paddw m4, m10
paddw m5, m7
paddw m4, m6
paddw m5, m15 ; partial_sum_alt[3] right
paddw m4, m14 ; partial_sum_alt[3] left
pshuflw m5, m5, q3012
punpckhwd m6, m4, m5
punpcklwd m4, m5
pmaddwd m6, m6
pmaddwd m4, m4
MULLD m6, [div_table+48]
MULLD m4, [div_table+32]
paddd m4, m6 ; cost[7a-d]
pslldq m5, m10, 6
psrldq m6, m10, 10
pslldq m7, m11, 4
psrldq m10, m11, 12
pslldq m11, m12, 2
psrldq m12, 14
paddw m5, m7
paddw m6, m10
paddw m5, m11
paddw m6, m12
paddw m5, m13
pshuflw m6, m6, q3012
punpckhwd m7, m5, m6
punpcklwd m5, m6
pmaddwd m7, m7
pmaddwd m5, m5
MULLD m7, [div_table+48]
MULLD m5, [div_table+32]
paddd m5, m7 ; cost[5a-d]
pslldq m6, m1, 2
psrldq m7, m1, 14
pslldq m10, m2, 4
psrldq m11, m2, 12
pslldq m12, m3, 6
psrldq m13, m3, 10
paddw m6, m0
paddw m7, m11
paddw m6, m10
; NOTE(review): the two "partial_sum_alt[3]" labels below look copied from
; the cost[7] block above; this block produces cost[1] - confirm the index.
paddw m7, m13 ; partial_sum_alt[3] right
paddw m6, m12 ; partial_sum_alt[3] left
pshuflw m7, m7, q3012
punpckhwd m10, m6, m7
punpcklwd m6, m7
pmaddwd m10, m10
pmaddwd m6, m6
MULLD m10, [div_table+48]
MULLD m6, [div_table+32]
paddd m6, m10 ; cost[1a-d]
pshufd m0, m0, q1032
pshufd m1, m1, q1032
pshufd m2, m2, q1032
pshufd m3, m3, q1032
pslldq m10, m0, 6
psrldq m11, m0, 10
pslldq m12, m1, 4
psrldq m13, m1, 12
pslldq m14, m2, 2
psrldq m2, 14
paddw m10, m12
paddw m11, m13
paddw m10, m14
paddw m11, m2
paddw m10, m3
pshuflw m11, m11, q3012
punpckhwd m12, m10, m11
punpcklwd m10, m11
pmaddwd m12, m12
pmaddwd m10, m10
MULLD m12, [div_table+48]
MULLD m10, [div_table+32]
paddd m10, m12 ; cost[3a-d]
phaddd m0, m9, m8 ; cost[0,4,2,6]
phaddd m6, m5
phaddd m10, m4
phaddd m1, m6, m10 ; cost[1,5,3,7]
; reduce the eight costs to the maximum; m15 carries the matching direction
; index (from pd_0to7) through the same compare masks
pcmpgtd m2, m1, m0 ; [1/5/3/7] > [0/4/2/6]
pand m3, m2, m1
pandn m4, m2, m0
por m3, m4 ; higher 4 values
pshufd m1, m1, q2301
pshufd m0, m0, q2301
pand m1, m2, m1
pandn m4, m2, m0
por m0, m4, m1 ; 4 values at idx^4 offset
pand m14, m2, [pd_0to7+16]
pandn m15, m2, [pd_0to7]
por m15, m14
punpckhqdq m4, m3, m0
punpcklqdq m3, m0
pcmpgtd m5, m4, m3 ; [2or3-6or7] > [0or1/4or5]
punpcklqdq m5, m5
pand m6, m5, m4
pandn m7, m5, m3
por m6, m7 ; { highest 2 values, complements at idx^4 }
movhlps m14, m15
pand m14, m5, m14
pandn m13, m5, m15
por m15, m13, m14
pshufd m7, m6, q3311
pcmpgtd m8, m7, m6 ; [4or5or6or7] > [0or1or2or3]
punpcklqdq m8, m8
pand m9, m8, m7
pandn m10, m8, m6
por m9, m10 ; max
movhlps m10, m9 ; complement at idx^4
; *var = (max cost - cost at idx^4) >> 10
psubd m9, m10
psrld m9, 10
movd [varq], m9
pshufd m14, m15, q1111
pand m14, m8, m14
pandn m13, m8, m15
por m15, m13, m14
; return value: selected direction index
movd eax, m15
%else
; x86-32 variant: same computation using m0-m7 plus stack spill slots
cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
%define PIC_reg r4
LEA PIC_reg, PIC_base_offset
pxor m0, m0
mova m1, [PIC_sym(pw_128)]
lea stride3q, [strideq*3]
; rows 0-3: byte sums, widen to words, subtract 128, spill to the stack
movq m5, [srcq+strideq*0]
movhps m5, [srcq+strideq*1]
movq m7, [srcq+strideq*2]
movhps m7, [srcq+stride3q]
psadbw m2, m5, m0
psadbw m3, m7, m0
packssdw m2, m3
punpcklbw m4, m5, m0
punpckhbw m5, m0
punpcklbw m6, m7, m0
punpckhbw m7, m0
psubw m4, m1
psubw m5, m1
psubw m6, m1
psubw m7, m1
; rows 0, 1, 2 go to 0x00/0x10/0x20 and row 3 to 0x50
mova [esp+0x00], m4
mova [esp+0x10], m5
mova [esp+0x20], m6
mova [esp+0x50], m7
; rows 4-7 stay in m4-m7
lea srcq, [srcq+strideq*4]
movq m5, [srcq+strideq*0]
movhps m5, [srcq+strideq*1]
movq m7, [srcq+strideq*2]
movhps m7, [srcq+stride3q]
psadbw m3, m5, m0
psadbw m0, m7, m0
packssdw m3, m0
pxor m0, m0
packssdw m2, m3
punpcklbw m4, m5, m0
punpckhbw m5, m0
punpcklbw m6, m7, m0
punpckhbw m7, m0
psubw m4, m1
psubw m5, m1
psubw m6, m1
psubw m7, m1
; m1 = 128 << 3: the bias accumulated by summing 8 unbiased pixels
psllw m1, 3
psubw m2, m1 ; partial_sum_hv[0]
pmaddwd m2, m2
mova m3, [esp+0x50]
mova m0, [esp+0x00]
paddw m0, [esp+0x10]
paddw m1, m3, [esp+0x20]
paddw m0, m4
paddw m1, m5
paddw m0, m6
paddw m1, m7
paddw m0, m1 ; partial_sum_hv[1]
pmaddwd m0, m0
phaddd m2, m0
MULLD m2, [PIC_sym(div_table)+48]
mova [esp+0x30], m2
mova m1, [esp+0x10]
pslldq m0, m1, 2
psrldq m1, 14
paddw m0, [esp+0x00]
pslldq m2, m3, 6
psrldq m3, 10
paddw m0, m2
paddw m1, m3
mova m3, [esp+0x20]
pslldq m2, m3, 4
psrldq m3, 12
paddw m0, m2 ; partial_sum_diag[0] top/left half
paddw m1, m3 ; partial_sum_diag[0] top/right half
pslldq m2, m4, 8
psrldq m3, m4, 8
paddw m0, m2
paddw m1, m3
pslldq m2, m5, 10
psrldq m3, m5, 6
paddw m0, m2
paddw m1, m3
pslldq m2, m6, 12
psrldq m3, m6, 4
paddw m0, m2
paddw m1, m3
pslldq m2, m7, 14
psrldq m3, m7, 2
paddw m0, m2 ; partial_sum_diag[0][0-7]
paddw m1, m3 ; partial_sum_diag[0][8-14,zero]
mova m3, [esp+0x50]
pshufb m1, [PIC_sym(shufw_6543210x)]
punpckhwd m2, m0, m1
punpcklwd m0, m1
pmaddwd m2, m2
pmaddwd m0, m0
MULLD m2, [PIC_sym(div_table)+16]
MULLD m0, [PIC_sym(div_table)+0]
paddd m0, m2 ; cost[0a-d]
mova [esp+0x40], m0
mova m1, [esp+0x00]
pslldq m0, m1, 14
psrldq m1, 2
paddw m0, m7
pslldq m2, m3, 8
psrldq m3, 8
paddw m0, m2
paddw m1, m3
mova m3, [esp+0x20]
pslldq m2, m3, 10
psrldq m3, 6
paddw m0, m2
paddw m1, m3
mova m3, [esp+0x10]
pslldq m2, m3, 12
psrldq m3, 4
paddw m0, m2
paddw m1, m3
pslldq m2, m4, 6
psrldq m3, m4, 10
paddw m0, m2
paddw m1, m3
pslldq m2, m5, 4
psrldq m3, m5, 12
paddw m0, m2
paddw m1, m3
pslldq m2, m6, 2
psrldq m3, m6, 14
paddw m0, m2 ; partial_sum_diag[1][0-7]
paddw m1, m3 ; partial_sum_diag[1][8-14,zero]
mova m3, [esp+0x50]
pshufb m1, [PIC_sym(shufw_6543210x)]
punpckhwd m2, m0, m1
punpcklwd m0, m1
pmaddwd m2, m2
pmaddwd m0, m0
MULLD m2, [PIC_sym(div_table)+16]
MULLD m0, [PIC_sym(div_table)+0]
paddd m0, m2 ; cost[4a-d]
phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b]
phaddd m1, [esp+0x30] ; cost[0,4,2,6]
mova [esp+0x30], m1
; pairwise horizontal sums go back to the stack (0x00/0x10/0x20) and m3;
; pairwise vertical (row-pair) sums stay in m7, m5, m4, m6
phaddw m0, [esp+0x00], m4
phaddw m1, [esp+0x10], m5
paddw m4, m5
mova m2, [esp+0x20]
paddw m5, m2, m3
phaddw m2, m6
paddw m6, m7
phaddw m3, m7
mova m7, [esp+0x00]
paddw m7, [esp+0x10]
mova [esp+0x00], m0
mova [esp+0x10], m1
mova [esp+0x20], m2
pslldq m1, m4, 4
pslldq m2, m6, 6
pslldq m0, m5, 2
paddw m1, m2
paddw m0, m7
psrldq m2, m5, 14
paddw m0, m1 ; partial_sum_alt[3] left
psrldq m1, m4, 12
paddw m1, m2
psrldq m2, m6, 10
paddw m1, m2 ; partial_sum_alt[3] right
pshuflw m1, m1, q3012
punpckhwd m2, m0, m1
punpcklwd m0, m1
pmaddwd m2, m2
pmaddwd m0, m0
MULLD m2, [PIC_sym(div_table)+48]
MULLD m0, [PIC_sym(div_table)+32]
paddd m0, m2 ; cost[7a-d]
mova [esp+0x40], m0
pslldq m0, m7, 6
psrldq m7, 10
pslldq m1, m5, 4
psrldq m5, 12
pslldq m2, m4, 2
psrldq m4, 14
paddw m0, m6
paddw m7, m5
paddw m0, m1
paddw m7, m4
paddw m0, m2
pshuflw m7, m7, q3012
punpckhwd m2, m0, m7
punpcklwd m0, m7
pmaddwd m2, m2
pmaddwd m0, m0
MULLD m2, [PIC_sym(div_table)+48]
MULLD m0, [PIC_sym(div_table)+32]
paddd m0, m2 ; cost[5a-d]
mova [esp+0x50], m0
mova m1, [esp+0x10]
mova m2, [esp+0x20]
pslldq m0, m1, 2
psrldq m1, 14
pslldq m4, m2, 4
psrldq m2, 12
pslldq m5, m3, 6
psrldq m6, m3, 10
paddw m0, [esp+0x00]
paddw m1, m2
paddw m4, m5
; NOTE(review): the two "partial_sum_alt[3]" labels below look copied from
; the cost[7] block above; this block produces cost[1] - confirm the index.
paddw m1, m6 ; partial_sum_alt[3] right
paddw m0, m4 ; partial_sum_alt[3] left
pshuflw m1, m1, q3012
punpckhwd m2, m0, m1
punpcklwd m0, m1
pmaddwd m2, m2
pmaddwd m0, m0
MULLD m2, [PIC_sym(div_table)+48]
MULLD m0, [PIC_sym(div_table)+32]
paddd m0, m2 ; cost[1a-d]
phaddd m0, [esp+0x50]
mova [esp+0x50], m0
pshufd m0, [esp+0x00], q1032
pshufd m1, [esp+0x10], q1032
pshufd m2, [esp+0x20], q1032
pshufd m3, m3, q1032
pslldq m4, m0, 6
psrldq m0, 10
pslldq m5, m1, 4
psrldq m1, 12
pslldq m6, m2, 2
psrldq m2, 14
paddw m4, m3
paddw m0, m1
paddw m5, m6
paddw m0, m2
paddw m4, m5
pshuflw m0, m0, q3012
punpckhwd m2, m4, m0
punpcklwd m4, m0
pmaddwd m2, m2
pmaddwd m4, m4
MULLD m2, [PIC_sym(div_table)+48]
MULLD m4, [PIC_sym(div_table)+32]
paddd m4, m2 ; cost[3a-d]
phaddd m4, [esp+0x40]
mova m1, [esp+0x50]
mova m0, [esp+0x30] ; cost[0,4,2,6]
phaddd m1, m4 ; cost[1,5,3,7]
; reduce the eight costs to the maximum; m6 carries the matching direction
; index (from pd_0to7) through the same compare masks
pcmpgtd m2, m1, m0 ; [1/5/3/7] > [0/4/2/6]
pand m3, m2, m1
pandn m4, m2, m0
por m3, m4 ; higher 4 values
pshufd m1, m1, q2301
pshufd m0, m0, q2301
pand m1, m2, m1
pandn m4, m2, m0
por m0, m4, m1 ; 4 values at idx^4 offset
pand m5, m2, [PIC_sym(pd_0to7)+16]
pandn m6, m2, [PIC_sym(pd_0to7)]
por m6, m5
punpckhqdq m4, m3, m0
punpcklqdq m3, m0
pcmpgtd m0, m4, m3 ; [2or3-6or7] > [0or1/4or5]
punpcklqdq m0, m0
pand m1, m0, m4
pandn m7, m0, m3
por m1, m7 ; { highest 2 values, complements at idx^4 }
movhlps m5, m6
pand m5, m0, m5
pandn m3, m0, m6
por m6, m3, m5
pshufd m7, m1, q3311
pcmpgtd m2, m7, m1 ; [4or5or6or7] > [0or1or2or3]
punpcklqdq m2, m2
pand m0, m2, m7
pandn m7, m2, m1
por m0, m7 ; max
movhlps m7, m0 ; complement at idx^4
; *var = (max cost - cost at idx^4) >> 10
psubd m0, m7
psrld m0, 10
movd [varq], m0
pshufd m5, m6, q1111
pand m5, m2, m5
pandn m3, m2, m6
por m6, m3, m5
; return value: selected direction index
movd eax, m6
%endif
RET
......@@ -35,6 +35,7 @@ pb_14x0_1_2: times 14 db 0
db 1, 2
pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
pb_0: times 16 db 0
pb_2: times 16 db 2
pb_3: times 16 db 3
......@@ -509,17 +510,11 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
;; self-guided ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro MULLD 2-3 1 ; %3 = is_constant
pmuludq m5, %1, %2
psrlq %1, 32
%if %3 == 0
pshufd m3, %2, q2301
pmuludq %1, m3
%else
pmuludq %1, %2
%endif
shufps %1, m5, q2020
pshufd %1, %1, q1302
%macro MULLD 2
pmulhuw m5, %1, %2
pmullw %1, %2
pslld m5, 16
paddd %1, m5
%endmacro
%macro GATHERDD 2
......@@ -766,7 +761,7 @@ cglobal sgr_box3_v, 5, 7, 8, -28, sumsq, sum, w, h, edge, x, y
jl .loop_x
RET
cglobal sgr_calc_ab1, 4, 7, 14, a, b, w, h, s
cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
movifnidn sd, sm
sub aq, (384+16-1)*4
sub bq, (384+16-1)*2
......@@ -777,17 +772,20 @@ cglobal sgr_calc_ab1, 4, 7, 14, a, b, w, h, s
SETUP_PIC r5, 0
%endif
movd m6, sd
pshufd m6, m6, 0
pshuflw m6, m6, q0000
punpcklqdq m6, m6
pxor m7, m7
DEFINE_ARGS a, b, w, h, x
%if ARCH_X86_64
mova m8, [pd_0xF00801C7]
mova m9, [pw_256]
psrld m10, m9, 13 ; pd_2048
mova m11, [pb_unpcklwdw]
%else
%define m8 [PIC_sym(pd_0xF00801C7)]
%define m9 [PIC_sym(pw_256)]
%define m10 [PIC_sym(pd_2048)]
%define m11 [PIC_sym(pb_unpcklwdw)]
%endif
.loop_y:
mov xq, -2
......@@ -818,10 +816,12 @@ cglobal sgr_calc_ab1, 4, 7, 14, a, b, w, h, s
GATHERDD m2, m3
psrld m4, 24
psrld m2, 24
MULLD m0, m4, 0
MULLD m1, m2, 0
packssdw m4, m2
psubw m5, m9, m4
packssdw m3, m4, m2
pshufb m4, m11
MULLD m0, m4
pshufb m2, m11
MULLD m1, m2
psubw m5, m9, m3
paddd m0, m10
paddd m1, m10
psrld m0, 12
......@@ -1516,7 +1516,8 @@ cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
SETUP_PIC r5, 0
%endif
movd m6, sd
pshufd m6, m6, 0
pshuflw m6, m6, q0000
punpcklqdq m6, m6
pxor m7, m7
DEFINE_ARGS a, b, w, h, x
%if ARCH_X86_64
......
......@@ -73,11 +73,11 @@ int main(const int argc, char *const *const argv) {
Dav1dContext *c;
Dav1dData data;
unsigned n_out = 0, total, fps[2];
const char *version = dav1d_version_vcs();
const char *version = dav1d_version();
if (strcmp(version, DAV1D_VERSION_VCS)) {
if (strcmp(version, DAV1D_VERSION)) {
fprintf(stderr, "Version mismatch (library: %s, executable: %s)\n",
version, DAV1D_VERSION_VCS);
version, DAV1D_VERSION);
return -1;
}
......@@ -100,7 +100,7 @@ int main(const int argc, char *const *const argv) {
}
if (!cli_settings.quiet)
fprintf(stderr, "dav1d %s - by VideoLAN\n", dav1d_version_vcs());
fprintf(stderr, "dav1d %s - by VideoLAN\n", dav1d_version());
// skip frames until a sequence header is found
if (cli_settings.skip) {
......
......@@ -263,7 +263,7 @@ void parse(const int argc, char *const *const argv,
!!parse_unsigned(optarg, ARG_ALL_LAYERS, argv[0]);
break;
case 'v':
fprintf(stderr, "%s\n", dav1d_version_vcs());
fprintf(stderr, "%s\n", dav1d_version());
exit(0);
case ARG_CPU_MASK:
dav1d_set_cpu_flags_mask(parse_enum(optarg, cpu_mask_tbl,
......