Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (6)
  • meson: fix include directories when building as subproject · 7629402b
    Kacper Michajłow authored and Ronald S. Bultje committed
    This makes `#include <dav1d/dav1d.h>` work correctly as we point to the
    parent include directory, same as in the normal installation.
    
    Also fixes a conflict when including "version.h", which may already
    exist in the parent project or another subproject, by being more
    specific about the header paths. Normally this works, but when built
    as a subproject, version.h is generated in the build directory, so it
    is no longer prioritized when included from dav1d.h, and another
    header with the same name may be picked up instead.
    7629402b
  • Support using C11 aligned_alloc for dav1d_alloc_aligned · d2687884
    Cameron Cawley authored and Ronald S. Bultje committed
    d2687884
  • AArch64: Optimize Armv8.0 Neon path of HBD horizontal filters · 109b2427
    Arpad Panyik authored and Martin Storsjö committed
    The reduction parts of the horizontal HBD MC filters use SRSHL+SQXTUN+
    SRSHL instruction sequences. In the horizontal case this can be
    rewritten using a single SQSHRUN instruction with an additional
    rounding value (34 for 10-bit and 40 for 12-bit).
    
    Relative runtime of micro benchmarks after this patch on some Cortex
    CPU cores:
    
    regular:     X1      A78      A76      A55
     mc  w2:  0.847x   0.864x   0.822x   0.859x
     mc  w4:  0.889x   0.994x   0.868x   0.917x
     mc  w8:  0.857x   0.911x   0.915x   0.978x
     mc w16:  0.890x   0.982x   0.868x   0.974x
     mc w32:  0.904x   0.991x   0.873x   0.967x
     mc w64:  0.919x   1.003x   0.860x   0.970x
    109b2427
  • AArch64: Optimize Armv8.0 Neon path of HBD horizontal 6-tap filters · 93339ce8
    Arpad Panyik authored and Martin Storsjö committed
    The 6-tap horizontal subpel filters can be further improved with some
    pointer arithmetic, saving a few instructions (EXTs) in their data
    rearrangement code.
    
    Relative runtime of micro benchmarks after this patch on some Cortex
    CPU cores:
    
    regular:     X1      A78      A76      A55
     mc  w8:  0.915x   0.937x   0.900x   0.982x
     mc w16:  0.917x   0.947x   0.911x   0.971x
     mc w32:  0.914x   0.938x   0.873x   0.961x
     mc w64:  0.918x   0.932x   0.882x   0.964x
    93339ce8
  • AArch64: Optimize Armv8.0 Neon path of HBD HV 6-tap filters · 2d808de1
    Arpad Panyik authored and Martin Storsjö committed
    The horizontal parts of the 6-tap HV subpel filters can be further
    improved with some pointer arithmetic, saving a few instructions
    (EXTs) in their data rearrangement code.
    
    Relative runtime of micro benchmarks after this patch on Cortex CPU
    cores:
    
    HBD mct hv        X1     A78     A76     A72     A55
     regular  w8:  0.952x  0.989x  0.924x  0.973x  0.976x
     regular w16:  0.961x  0.993x  0.928x  0.952x  0.971x
     regular w32:  0.964x  0.996x  0.930x  0.973x  0.972x
     regular w64:  0.963x  0.997x  0.930x  0.969x  0.974x
    2d808de1
  • AArch64: Optimize Armv8.0 Neon path of SBD H/HV 6-tap filters · a992a9be
    Arpad Panyik authored and Martin Storsjö committed
    The 6-tap horizontal filters and the horizontal parts of the 6-tap HV
    subpel filters can be further improved with some pointer arithmetic,
    saving a few instructions (EXTs) in their data rearrangement code.
    
    Relative runtime of micro benchmarks after this patch on Cortex CPU
    cores:
    
    SBD mct h         X1     A78     A76     A72     A55
     regular  w8:  0.878x  0.894x  0.990x  0.923x  0.944x
     regular w16:  0.962x  0.931x  0.943x  0.949x  0.949x
     regular w32:  0.937x  0.937x  0.972x  0.938x  0.947x
     regular w64:  0.920x  0.965x  0.992x  0.936x  0.944x
    
    SBD mct hv        X1     A78     A76     A72     A55
     regular  w8:  0.931x  0.970x  0.951x  0.950x  0.971x
     regular w16:  0.940x  0.971x  0.941x  0.952x  0.967x
     regular w32:  0.943x  0.972x  0.946x  0.961x  0.974x
     regular w64:  0.943x  0.973x  0.952x  0.944x  0.975x
    a992a9be
......@@ -31,10 +31,10 @@
#include <errno.h>
#include <stdarg.h>
-#include "common.h"
-#include "picture.h"
-#include "data.h"
-#include "version.h"
+#include "dav1d/common.h"
+#include "dav1d/picture.h"
+#include "dav1d/data.h"
+#include "dav1d/version.h"
#ifdef __cplusplus
extern "C" {
......
......@@ -157,6 +157,12 @@ else
if cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
cdata.set('HAVE_POSIX_MEMALIGN', 1)
endif
+if cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args)
+cdata.set('HAVE_MEMALIGN', 1)
+endif
+if cc.has_function('aligned_alloc', prefix : '#include <stdlib.h>', args : test_args)
+cdata.set('HAVE_ALIGNED_ALLOC', 1)
+endif
endif
# check for fseeko on android. It is not always available if _FILE_OFFSET_BITS is defined to 64
......
......@@ -1549,7 +1549,11 @@ L(\type\()_\taps\()_h):
80: // 8xN h
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8b}, [\xmx]
.ifc \taps, 6tap
sub \src, \src, #2
.else
sub \src, \src, #3
.endif
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \d_strd, \d_strd, #1
......@@ -1564,25 +1568,23 @@ L(\type\()_\taps\()_h):
uxtl v21.8h, v21.8b
.ifc \taps, 6tap
ext v19.16b, v16.16b, v17.16b, #2
ext v23.16b, v20.16b, v21.16b, #2
mul v18.8h, v19.8h, v0.h[1]
mul v22.8h, v23.8h, v0.h[1]
.irpc i, 23456
ext v19.16b, v16.16b, v17.16b, #(2*\i)
ext v23.16b, v20.16b, v21.16b, #(2*\i)
mul v18.8h, v16.8h, v0.h[1]
mul v22.8h, v20.8h, v0.h[1]
.irpc i, 23456
ext v19.16b, v16.16b, v17.16b, #(2*\i-2)
ext v23.16b, v20.16b, v21.16b, #(2*\i-2)
mla v18.8h, v19.8h, v0.h[\i]
mla v22.8h, v23.8h, v0.h[\i]
.endr
.endr
.else // 8tap
mul v18.8h, v16.8h, v0.h[0]
mul v22.8h, v20.8h, v0.h[0]
.irpc i, 1234567
.irpc i, 1234567
ext v19.16b, v16.16b, v17.16b, #(2*\i)
ext v23.16b, v20.16b, v21.16b, #(2*\i)
mla v18.8h, v19.8h, v0.h[\i]
mla v22.8h, v23.8h, v0.h[\i]
.endr
.endr
.endif
subs \h, \h, #2
srshr v18.8h, v18.8h, #2
......@@ -1604,7 +1606,11 @@ L(\type\()_\taps\()_h):
1280: // 16xN, 32xN, ... h
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8b}, [\xmx]
.ifc \taps, 6tap
sub \src, \src, #2
.else
sub \src, \src, #3
.endif
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
......@@ -1629,30 +1635,26 @@ L(\type\()_\taps\()_h):
16:
.ifc \taps, 6tap
ext v28.16b, v16.16b, v17.16b, #2
ext v29.16b, v17.16b, v18.16b, #2
ext v30.16b, v20.16b, v21.16b, #2
ext v31.16b, v21.16b, v22.16b, #2
mul v24.8h, v28.8h, v0.h[1]
mul v25.8h, v29.8h, v0.h[1]
mul v26.8h, v30.8h, v0.h[1]
mul v27.8h, v31.8h, v0.h[1]
.irpc i, 23456
ext v28.16b, v16.16b, v17.16b, #(2*\i)
ext v29.16b, v17.16b, v18.16b, #(2*\i)
ext v30.16b, v20.16b, v21.16b, #(2*\i)
ext v31.16b, v21.16b, v22.16b, #(2*\i)
mul v24.8h, v16.8h, v0.h[1]
mul v25.8h, v17.8h, v0.h[1]
mul v26.8h, v20.8h, v0.h[1]
mul v27.8h, v21.8h, v0.h[1]
.irpc i, 23456
ext v28.16b, v16.16b, v17.16b, #(2*\i-2)
ext v29.16b, v17.16b, v18.16b, #(2*\i-2)
ext v30.16b, v20.16b, v21.16b, #(2*\i-2)
ext v31.16b, v21.16b, v22.16b, #(2*\i-2)
mla v24.8h, v28.8h, v0.h[\i]
mla v25.8h, v29.8h, v0.h[\i]
mla v26.8h, v30.8h, v0.h[\i]
mla v27.8h, v31.8h, v0.h[\i]
.endr
.endr
.else // 8tap
mul v24.8h, v16.8h, v0.h[0]
mul v25.8h, v17.8h, v0.h[0]
mul v26.8h, v20.8h, v0.h[0]
mul v27.8h, v21.8h, v0.h[0]
.irpc i, 1234567
.irpc i, 1234567
ext v28.16b, v16.16b, v17.16b, #(2*\i)
ext v29.16b, v17.16b, v18.16b, #(2*\i)
ext v30.16b, v20.16b, v21.16b, #(2*\i)
......@@ -1661,7 +1663,7 @@ L(\type\()_\taps\()_h):
mla v25.8h, v29.8h, v0.h[\i]
mla v26.8h, v30.8h, v0.h[\i]
mla v27.8h, v31.8h, v0.h[\i]
.endr
.endr
.endif
srshr v24.8h, v24.8h, #2
srshr v25.8h, v25.8h, #2
......@@ -2386,7 +2388,11 @@ L(\type\()_\taps\()_filter_4):
add \xmy, \xmy, #2
ld1 {v0.8b}, [\xmx]
ld1 {v1.s}[0], [\xmy]
.ifc \taps, 6tap
sub \src, \src, #2
.else
sub \src, \src, #3
.endif
sub \src, \src, \s_strd
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
......@@ -2464,8 +2470,10 @@ L(\type\()_\taps\()_filter_4):
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8b}, [\xmx]
ld1 {v1.8b}, [\xmy]
.ifc \taps, 6tap
sub \src, \src, #2
.else
sub \src, \src, #3
.ifc \taps, 8tap
sub \src, \src, \s_strd
.endif
sub \src, \src, \s_strd, lsl #1
......@@ -2609,17 +2617,15 @@ L(\type\()_\taps\()_filter_8_first):
uxtl v28.8h, v28.8b
uxtl v29.8h, v29.8b
.ifc \taps, 6tap
ext v24.16b, v28.16b, v29.16b, #(2*1)
ext v25.16b, v28.16b, v29.16b, #(2*2)
ext v26.16b, v28.16b, v29.16b, #(2*3)
ext v27.16b, v28.16b, v29.16b, #(2*4)
mul v16.8h, v24.8h, v0.h[1]
mul v16.8h, v28.8h, v0.h[1]
ext v25.16b, v28.16b, v29.16b, #(2*1)
ext v26.16b, v28.16b, v29.16b, #(2*2)
ext v27.16b, v28.16b, v29.16b, #(2*3)
mla v16.8h, v25.8h, v0.h[2]
mla v16.8h, v26.8h, v0.h[3]
mla v16.8h, v27.8h, v0.h[4]
ext v24.16b, v28.16b, v29.16b, #(2*5)
ext v25.16b, v28.16b, v29.16b, #(2*6)
ext v26.16b, v28.16b, v29.16b, #(2*7)
ext v24.16b, v28.16b, v29.16b, #(2*4)
ext v25.16b, v28.16b, v29.16b, #(2*5)
mla v16.8h, v24.8h, v0.h[5]
mla v16.8h, v25.8h, v0.h[6]
.else // 8tap
......@@ -2650,25 +2656,23 @@ L(\type\()_\taps\()_filter_8):
uxtl v30.8h, v30.8b
uxtl v31.8h, v31.8b
.ifc \taps, 6tap
ext v26.16b, v28.16b, v29.16b, #2
ext v27.16b, v30.16b, v31.16b, #2
mul v24.8h, v26.8h, v0.h[1]
mul v25.8h, v27.8h, v0.h[1]
.irpc i, 23456
ext v26.16b, v28.16b, v29.16b, #(2*\i)
ext v27.16b, v30.16b, v31.16b, #(2*\i)
mul v24.8h, v28.8h, v0.h[1]
mul v25.8h, v30.8h, v0.h[1]
.irpc i, 23456
ext v26.16b, v28.16b, v29.16b, #(2*\i-2)
ext v27.16b, v30.16b, v31.16b, #(2*\i-2)
mla v24.8h, v26.8h, v0.h[\i]
mla v25.8h, v27.8h, v0.h[\i]
.endr
.endr
.else // 8tap
mul v24.8h, v28.8h, v0.h[0]
mul v25.8h, v30.8h, v0.h[0]
.irpc i, 1234567
.irpc i, 1234567
ext v26.16b, v28.16b, v29.16b, #(2*\i)
ext v27.16b, v30.16b, v31.16b, #(2*\i)
mla v24.8h, v26.8h, v0.h[\i]
mla v25.8h, v27.8h, v0.h[\i]
.endr
.endr
.endif
srshr v24.8h, v24.8h, #2
srshr v25.8h, v25.8h, #2
......
......@@ -1601,18 +1601,18 @@ L(\type\()_\taps\()_h):
b.ne L(\type\()_\taps\()_hv)
movrel x10, \type\()_\taps\()_h_tbl
dup v30.4s, w12 // 6 - intermediate_bits
ldrsw x9, [x10, x9, lsl #2]
neg v30.4s, v30.4s // -(6-intermediate_bits)
.ifc \type, put
dup v29.8h, \bdmax // intermediate_bits
mov w12, #34 // rounding for 10-bit
mov w13, #40 // rounding for 12-bit
cmp \bdmax, #2 // 10-bit: 4, 12-bit: 2
csel w12, w12, w13, ne // select rounding based on \bdmax
.else
neg w12, w12 // -(6 - intermediate_bits)
movi v28.8h, #(PREP_BIAS >> 8), lsl #8
.endif
add x10, x10, x9
.ifc \type, put
neg v29.8h, v29.8h // -intermediate_bits
.endif
dup v30.4s, w12 // rounding or shift amount
br x10
20: // 2xN h
......@@ -1629,6 +1629,7 @@ L(\type\()_\taps\()_h):
2:
ld1 {v4.8h}, [\src], \s_strd
ld1 {v6.8h}, [\sr2], \s_strd
mov v2.16b, v30.16b
ext v5.16b, v4.16b, v4.16b, #2
ext v7.16b, v6.16b, v6.16b, #2
subs \h, \h, #2
......@@ -1636,16 +1637,14 @@ L(\type\()_\taps\()_h):
trn2 v6.2s, v4.2s, v6.2s
trn1 v4.2s, v5.2s, v7.2s
trn2 v7.2s, v5.2s, v7.2s
smull v3.4s, v3.4h, v0.h[0]
smlal v3.4s, v4.4h, v0.h[1]
smlal v3.4s, v6.4h, v0.h[2]
smlal v3.4s, v7.4h, v0.h[3]
srshl v3.4s, v3.4s, v30.4s // -(6-intermediate_bits)
sqxtun v3.4h, v3.4s
srshl v3.4h, v3.4h, v29.4h // -intermediate_bits
umin v3.4h, v3.4h, v31.4h
st1 {v3.s}[0], [\dst], \d_strd
st1 {v3.s}[1], [\ds2], \d_strd
smlal v2.4s, v3.4h, v0.h[0]
smlal v2.4s, v4.4h, v0.h[1]
smlal v2.4s, v6.4h, v0.h[2]
smlal v2.4s, v7.4h, v0.h[3]
sqshrun v2.4h, v2.4s, #6
umin v2.4h, v2.4h, v31.4h
st1 {v2.s}[0], [\dst], \d_strd
st1 {v2.s}[1], [\ds2], \d_strd
b.gt 2b
ret
.endif
......@@ -1663,6 +1662,10 @@ L(\type\()_\taps\()_h):
4:
ld1 {v16.8h}, [\src], \s_strd
ld1 {v20.8h}, [\sr2], \s_strd
.ifc \type, put
mov v2.16b, v30.16b
mov v3.16b, v30.16b
.endif
ext v17.16b, v16.16b, v16.16b, #2
ext v18.16b, v16.16b, v16.16b, #4
ext v19.16b, v16.16b, v16.16b, #6
......@@ -1670,22 +1673,29 @@ L(\type\()_\taps\()_h):
ext v22.16b, v20.16b, v20.16b, #4
ext v23.16b, v20.16b, v20.16b, #6
subs \h, \h, #2
smull v16.4s, v16.4h, v0.h[0]
smlal v16.4s, v17.4h, v0.h[1]
smlal v16.4s, v18.4h, v0.h[2]
smlal v16.4s, v19.4h, v0.h[3]
smull v20.4s, v20.4h, v0.h[0]
smlal v20.4s, v21.4h, v0.h[1]
smlal v20.4s, v22.4h, v0.h[2]
smlal v20.4s, v23.4h, v0.h[3]
srshl v16.4s, v16.4s, v30.4s // -(6-intermediate_bits)
srshl v20.4s, v20.4s, v30.4s // -(6-intermediate_bits)
.ifc \type, put
sqxtun v16.4h, v16.4s
sqxtun2 v16.8h, v20.4s
srshl v16.8h, v16.8h, v29.8h // -intermediate_bits
smlal v2.4s, v16.4h, v0.h[0]
.else
smull v2.4s, v16.4h, v0.h[0]
.endif
smlal v2.4s, v17.4h, v0.h[1]
smlal v2.4s, v18.4h, v0.h[2]
smlal v2.4s, v19.4h, v0.h[3]
.ifc \type, put
smlal v3.4s, v20.4h, v0.h[0]
.else
smull v3.4s, v20.4h, v0.h[0]
.endif
smlal v3.4s, v21.4h, v0.h[1]
smlal v3.4s, v22.4h, v0.h[2]
smlal v3.4s, v23.4h, v0.h[3]
.ifc \type, put
sqshrun v16.4h, v2.4s, #6
sqshrun2 v16.8h, v3.4s, #6
umin v16.8h, v16.8h, v31.8h
.else
srshl v16.4s, v2.4s, v30.4s // -(6-intermediate_bits)
srshl v20.4s, v3.4s, v30.4s // -(6-intermediate_bits)
uzp1 v16.8h, v16.8h, v20.8h // Same as xtn, xtn2
sub v16.8h, v16.8h, v28.8h // PREP_BIAS
.endif
......@@ -1701,7 +1711,11 @@ L(\type\()_\taps\()_h):
1280: // 8xN, 16xN, 32xN, ... h
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8b}, [\xmx]
.ifc \taps, 6tap
sub \src, \src, #4
.else
sub \src, \src, #6
.endif
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
......@@ -1720,49 +1734,67 @@ L(\type\()_\taps\()_h):
8:
.ifc \taps, 6tap
ext v24.16b, v16.16b, v17.16b, #2
ext v25.16b, v20.16b, v21.16b, #2
smull v18.4s, v24.4h, v0.h[1]
smull2 v19.4s, v24.8h, v0.h[1]
smull v22.4s, v25.4h, v0.h[1]
smull2 v23.4s, v25.8h, v0.h[1]
.irpc i, 23456
ext v24.16b, v16.16b, v17.16b, #(2*\i)
ext v25.16b, v20.16b, v21.16b, #(2*\i)
.ifc \type, put
mov v18.16b, v30.16b
mov v19.16b, v30.16b
smlal v18.4s, v16.4h, v0.h[1]
smlal2 v19.4s, v16.8h, v0.h[1]
mov v22.16b, v30.16b
mov v23.16b, v30.16b
smlal v22.4s, v20.4h, v0.h[1]
smlal2 v23.4s, v20.8h, v0.h[1]
.else
smull v18.4s, v16.4h, v0.h[1]
smull2 v19.4s, v16.8h, v0.h[1]
smull v22.4s, v20.4h, v0.h[1]
smull2 v23.4s, v20.8h, v0.h[1]
.endif
.irpc i, 23456
ext v24.16b, v16.16b, v17.16b, #(2*\i-2)
ext v25.16b, v20.16b, v21.16b, #(2*\i-2)
smlal v18.4s, v24.4h, v0.h[\i]
smlal2 v19.4s, v24.8h, v0.h[\i]
smlal v22.4s, v25.4h, v0.h[\i]
smlal2 v23.4s, v25.8h, v0.h[\i]
.endr
.endr
.else // 8tap
.ifc \type, put
mov v18.16b, v30.16b
mov v19.16b, v30.16b
smlal v18.4s, v16.4h, v0.h[0]
smlal2 v19.4s, v16.8h, v0.h[0]
mov v22.16b, v30.16b
mov v23.16b, v30.16b
smlal v22.4s, v20.4h, v0.h[0]
smlal2 v23.4s, v20.8h, v0.h[0]
.else
smull v18.4s, v16.4h, v0.h[0]
smull2 v19.4s, v16.8h, v0.h[0]
smull v22.4s, v20.4h, v0.h[0]
smull2 v23.4s, v20.8h, v0.h[0]
.irpc i, 1234567
.endif
.irpc i, 1234567
ext v24.16b, v16.16b, v17.16b, #(2*\i)
ext v25.16b, v20.16b, v21.16b, #(2*\i)
smlal v18.4s, v24.4h, v0.h[\i]
smlal2 v19.4s, v24.8h, v0.h[\i]
smlal v22.4s, v25.4h, v0.h[\i]
smlal2 v23.4s, v25.8h, v0.h[\i]
.endr
.endr
.endif
subs \mx, \mx, #8
srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits)
srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits)
srshl v22.4s, v22.4s, v30.4s // -(6-intermediate_bits)
srshl v23.4s, v23.4s, v30.4s // -(6-intermediate_bits)
.ifc \type, put
sqxtun v18.4h, v18.4s
sqxtun2 v18.8h, v19.4s
sqxtun v22.4h, v22.4s
sqxtun2 v22.8h, v23.4s
srshl v18.8h, v18.8h, v29.8h // -intermediate_bits
srshl v22.8h, v22.8h, v29.8h // -intermediate_bits
sqshrun v18.4h, v18.4s, #6
sqshrun2 v18.8h, v19.4s, #6
sqshrun v22.4h, v22.4s, #6
sqshrun2 v22.8h, v23.4s, #6
umin v18.8h, v18.8h, v31.8h
umin v22.8h, v22.8h, v31.8h
.else
srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits)
srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits)
srshl v22.4s, v22.4s, v30.4s // -(6-intermediate_bits)
srshl v23.4s, v23.4s, v30.4s // -(6-intermediate_bits)
uzp1 v18.8h, v18.8h, v19.8h // Same as xtn, xtn2
uzp1 v22.8h, v22.8h, v23.8h // Ditto
sub v18.8h, v18.8h, v28.8h // PREP_BIAS
......@@ -2472,7 +2504,11 @@ L(\type\()_\taps\()_filter_4):
add \xmy, \xmy, #2
ld1 {v0.8b}, [\xmx]
ld1 {v1.s}[0], [\xmy]
.ifc \taps, 6tap
sub \src, \src, #4
.else
sub \src, \src, #6
.endif
sub \src, \src, \s_strd
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
......@@ -2486,13 +2522,23 @@ L(\type\()_\taps\()_filter_4):
lsl \s_strd, \s_strd, #1
ld1 {v27.8h, v28.8h}, [\src], \s_strd
.ifc \taps, 6tap
smull v24.4s, v27.4h, v0.h[1]
smull2 v25.4s, v27.8h, v0.h[1]
.irpc i, 23456
ext v26.16b, v27.16b, v28.16b, #(2*\i-2)
smlal v24.4s, v26.4h, v0.h[\i]
smlal2 v25.4s, v26.8h, v0.h[\i]
.endr
.else
smull v24.4s, v27.4h, v0.h[0]
smull2 v25.4s, v27.8h, v0.h[0]
.irpc i, 1234567
.irpc i, 1234567
ext v26.16b, v27.16b, v28.16b, #(2*\i)
smlal v24.4s, v26.4h, v0.h[\i]
smlal2 v25.4s, v26.8h, v0.h[\i]
.endr
.endr
.endif
srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
// The intermediates from the horizontal pass fit in 16 bit without
......@@ -2570,8 +2616,10 @@ L(\type\()_\taps\()_filter_4):
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8b}, [\xmx]
ld1 {v1.8b}, [\xmy]
.ifc \taps, 6tap
sub \src, \src, #4
.else
sub \src, \src, #6
.ifc \taps, 8tap
sub \src, \src, \s_strd
.endif
sub \src, \src, \s_strd, lsl #1
......@@ -2588,22 +2636,21 @@ L(\type\()_\taps\()_filter_4):
ld1 {v27.8h, v28.8h}, [\src], \s_strd
.ifc \taps, 6tap
ext v26.16b, v27.16b, v28.16b, #2
smull v24.4s, v26.4h, v0.h[1]
smull2 v25.4s, v26.8h, v0.h[1]
.irpc i, 23456
ext v26.16b, v27.16b, v28.16b, #(2*\i)
smull v24.4s, v27.4h, v0.h[1]
smull2 v25.4s, v27.8h, v0.h[1]
.irpc i, 23456
ext v26.16b, v27.16b, v28.16b, #(2*\i-2)
smlal v24.4s, v26.4h, v0.h[\i]
smlal2 v25.4s, v26.8h, v0.h[\i]
.endr
.endr
.else // 8tap
smull v24.4s, v27.4h, v0.h[0]
smull2 v25.4s, v27.8h, v0.h[0]
.irpc i, 1234567
.irpc i, 1234567
ext v26.16b, v27.16b, v28.16b, #(2*\i)
smlal v24.4s, v26.4h, v0.h[\i]
smlal2 v25.4s, v26.8h, v0.h[\i]
.endr
.endr
.endif
srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
......@@ -2745,15 +2792,13 @@ L(\type\()_\taps\()_filter_8):
ld1 {v4.8h, v5.8h}, [\sr2], \s_strd
ld1 {v6.8h, v7.8h}, [\src], \s_strd
.ifc \taps, 6tap
ext v23.16b, v4.16b, v5.16b, #2
ext v24.16b, v6.16b, v7.16b, #2
smull v25.4s, v23.4h, v0.h[1]
smull2 v26.4s, v23.8h, v0.h[1]
smull v27.4s, v24.4h, v0.h[1]
smull2 v28.4s, v24.8h, v0.h[1]
smull v25.4s, v4.4h, v0.h[1]
smull2 v26.4s, v4.8h, v0.h[1]
smull v27.4s, v6.4h, v0.h[1]
smull2 v28.4s, v6.8h, v0.h[1]
.irpc i, 23456
ext v23.16b, v4.16b, v5.16b, #(2*\i)
ext v24.16b, v6.16b, v7.16b, #(2*\i)
ext v23.16b, v4.16b, v5.16b, #(2*\i-2)
ext v24.16b, v6.16b, v7.16b, #(2*\i-2)
smlal v25.4s, v23.4h, v0.h[\i]
smlal2 v26.4s, v23.8h, v0.h[\i]
smlal v27.4s, v24.4h, v0.h[\i]
......
......@@ -109,16 +109,7 @@ void *dav1d_malloc(const enum AllocationType type, const size_t sz) {
void *dav1d_alloc_aligned(const enum AllocationType type,
const size_t sz, const size_t align)
{
-assert(!(align & (align - 1)));
-void *ptr;
-#ifdef _WIN32
-ptr = _aligned_malloc(sz + align, align);
-#elif defined(HAVE_POSIX_MEMALIGN)
-if (posix_memalign(&ptr, align, sz + align)) return NULL;
-#else
-ptr = memalign(align, sz + align);
-#endif
+void *const ptr = dav1d_alloc_aligned_internal(sz + align, align);
return track_alloc(type, ptr, sz, align);
}
......@@ -140,12 +131,7 @@ void dav1d_free(void *ptr) {
void dav1d_free_aligned(void *ptr) {
if (ptr) {
-ptr = track_free(ptr);
-#ifdef _WIN32
-_aligned_free(ptr);
-#else
-free(ptr);
-#endif
+dav1d_free_aligned_internal(track_free(ptr));
}
}
......
......@@ -32,7 +32,7 @@
#include <stdlib.h>
-#if defined(_WIN32) || !defined(HAVE_POSIX_MEMALIGN)
+#if defined(_WIN32) || defined(HAVE_MEMALIGN)
#include <malloc.h>
#endif
......@@ -79,25 +79,14 @@ typedef struct Dav1dMemPool {
#endif
} Dav1dMemPool;
-#if TRACK_HEAP_ALLOCATIONS
-void *dav1d_malloc(enum AllocationType type, size_t sz);
-void *dav1d_realloc(enum AllocationType type, void *ptr, size_t sz);
-void *dav1d_alloc_aligned(enum AllocationType type, size_t sz, size_t align);
-void dav1d_free(void *ptr);
-void dav1d_free_aligned(void *ptr);
-void dav1d_log_alloc_stats(Dav1dContext *c);
-#else
-#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
-#define dav1d_malloc(type, sz) malloc(sz)
-#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz)
-#define dav1d_free(ptr) free(ptr)
+// TODO: Move this to a common location?
+#define ROUND_UP(x,a) (((x)+((a)-1)) & ~((a)-1))
/*
* Allocate align-byte aligned memory. The return value can be released
* by calling the dav1d_free_aligned() function.
*/
-static inline void *dav1d_alloc_aligned(const size_t sz, const size_t align) {
+static inline void *dav1d_alloc_aligned_internal(const size_t sz, const size_t align) {
assert(!(align & (align - 1)));
#ifdef _WIN32
return _aligned_malloc(sz, align);
......@@ -105,13 +94,18 @@ static inline void *dav1d_alloc_aligned(const size_t sz, const size_t align) {
void *ptr;
if (posix_memalign(&ptr, align, sz)) return NULL;
return ptr;
-#else
+#elif defined(HAVE_MEMALIGN)
return memalign(align, sz);
+#elif defined(HAVE_ALIGNED_ALLOC)
+// The C11 standard specifies that the size parameter
+// must be an integral multiple of alignment.
+return aligned_alloc(align, ROUND_UP(sz, align));
+#else
+#error No aligned allocation functions are available
#endif
}
-#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned(sz, align)
-static inline void dav1d_free_aligned(void *ptr) {
+static inline void dav1d_free_aligned_internal(void *ptr) {
#ifdef _WIN32
_aligned_free(ptr);
#else
......@@ -119,6 +113,20 @@ static inline void dav1d_free_aligned(void *ptr) {
#endif
}
+#if TRACK_HEAP_ALLOCATIONS
+void *dav1d_malloc(enum AllocationType type, size_t sz);
+void *dav1d_realloc(enum AllocationType type, void *ptr, size_t sz);
+void *dav1d_alloc_aligned(enum AllocationType type, size_t sz, size_t align);
+void dav1d_free(void *ptr);
+void dav1d_free_aligned(void *ptr);
+void dav1d_log_alloc_stats(Dav1dContext *c);
+#else
+#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
+#define dav1d_malloc(type, sz) malloc(sz)
+#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz)
+#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
+#define dav1d_free(ptr) free(ptr)
+#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
+#endif /* TRACK_HEAP_ALLOCATIONS */
void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf);
......
......@@ -371,7 +371,7 @@ libdav1d = library('dav1d',
)
dav1d_dep = declare_dependency(link_with: libdav1d,
-include_directories : include_directories('../include/dav1d')
+include_directories : include_directories('../include')
)
#
......