diff --git a/common/aarch64/asm-offsets.c b/common/aarch64/asm-offsets.c
index 279cc01029efce821a24c1cfcf921851bf432dfc..95f228b88b847e4017abbc03ccf429109d2280e2 100644
--- a/common/aarch64/asm-offsets.c
+++ b/common/aarch64/asm-offsets.c
@@ -26,11 +26,19 @@
 #include "common/common.h"
 #include "asm-offsets.h"
 
+#define STATIC_ASSERT(name, x) int assert_##name[2 * !!(x) - 1] /* array size is 1 when x is true, -1 (a compile error) when x is false */
+
 #define X264_CHECK_OFFSET(s, m, o) struct check_##s##_##m \
 { \
-    int m_##m[2 * (offsetof(s, m) == o) - 1]; \
+    STATIC_ASSERT(offset_##m, offsetof(s, m) == o); /* fails to compile unless member m of s sits at byte offset o */ \
+}
+
+#define X264_CHECK_REL_OFFSET(s, a, type, b) struct check_##s##_##a##_##b \
+{ /* compile-time check: member b of s starts exactly sizeof(type) bytes after member a */ \
+    STATIC_ASSERT(rel_offset_##a##_##b, offsetof(s, a) + sizeof(type) == offsetof(s, b)); \
 }
 
+
 X264_CHECK_OFFSET(x264_cabac_t, i_low,               CABAC_I_LOW);
 X264_CHECK_OFFSET(x264_cabac_t, i_range,             CABAC_I_RANGE);
 X264_CHECK_OFFSET(x264_cabac_t, i_queue,             CABAC_I_QUEUE);
@@ -40,3 +48,9 @@ X264_CHECK_OFFSET(x264_cabac_t, p,                   CABAC_P);
 X264_CHECK_OFFSET(x264_cabac_t, p_end,               CABAC_P_END);
 X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded,     CABAC_F8_BITS_ENCODED);
 X264_CHECK_OFFSET(x264_cabac_t, state,               CABAC_STATE);
+
+// the aarch64 asm makes the following additional assumptions about the
+// x264_cabac_t memory layout
+
+X264_CHECK_REL_OFFSET(x264_cabac_t, i_low,    int, i_range); // cabac_encode_terminal_asm writes i_low and i_range with a single stp
+X264_CHECK_REL_OFFSET(x264_cabac_t, i_queue,  int, i_bytes_outstanding); // NOTE(review): presumably relied on by a paired access elsewhere in the asm - confirm
diff --git a/common/aarch64/cabac-a.S b/common/aarch64/cabac-a.S
index 8c28c82507746a960f9bf270e4a178b26fc8a4f8..cff1a06e556273e19ca72ce6a558402f0d354b78 100644
--- a/common/aarch64/cabac-a.S
+++ b/common/aarch64/cabac-a.S
@@ -116,7 +116,20 @@ endfunc
 
 function cabac_encode_terminal_asm, export=1
     ldr         w12, [x0, #CABAC_I_RANGE]
-    ldr         w11, [x0, #CABAC_I_LOW]
     sub         w12, w12, #2
-    b           cabac_encode_renorm
+    tbz         w12, #8, 1f                     // bit 8 of (i_range - 2) clear: take the shift path at 1
+
+    str         w12, [x0, #CABAC_I_RANGE]       // bit 8 set: only write back i_range - 2
+    ret
+1:
+    ldr         w2,  [x0, #CABAC_I_QUEUE]       // w2  = i_queue
+    ldr         w11, [x0, #CABAC_I_LOW]         // w11 = i_low
+    lsl         w12, w12, #1                    // i_range <<= 1
+    adds        w2,  w2,  #1                    // i_queue += 1, sets the flags tested by b.ge below
+    lsl         w11, w11, #1                    // i_low <<= 1 (lsl leaves the flags untouched)
+    b.ge        cabac_putbyte                   // incremented i_queue >= 0: continue in cabac_putbyte (defined outside this hunk)
+
+    stp         w11, w12, [x0, #CABAC_I_LOW]    // store i_low, i_range as a pair (i_range must directly follow i_low)
+    str         w2,  [x0, #CABAC_I_QUEUE]       // store the incremented i_queue
+    ret
 endfunc