Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • videolan/x264
  • EwoutH/x264
  • gramner/x264
  • BugMaster/x264
  • MaskRay/x264
  • thresh/x264
  • tpm/x264
  • wolfired/x264
  • ifb/x264
  • robinstorm/x264
  • ltnokiago/x264
  • janne/x264
  • Kromjunya/x264
  • trisnaayu0596/x264
  • felipegarcia1402/x264
  • coder2004/x264
  • philou/x264
  • walagnatalia/x264
  • DonDiego/x264
  • JHammler/x264
  • qyot27/x264
  • dwbuiten/x264
  • Kagami/x264
  • andriy-andreyev/x264
  • gxw/x264
  • trofi/x264
  • kierank/x264
  • aureliendavid/x264
  • galad/x264
  • roommini/x264
  • ocrete/x264
  • mstorsjo/x264
  • yinsj0116/x264
  • mamonet/x264
  • 1div0/x264
  • ko1265/x264
  • sergiomb2/x264
  • xutongda/x264
  • wenzhiwu/x264
  • arrowd/x264
  • FranceBB/x264
  • ziemek99/x264
  • longervision/x264
  • xopok/x264
  • jbk/x264
  • szatmary/x264
  • pekdon/x264
  • Jiangguyu/x264
  • jrtc27/x264
  • kankanol1/x264
  • gxwLite/x264
  • brad/x264
  • Gc6026/x264
  • jdek/x264
  • appcrash/x264
  • tguillem/x264
  • As/x264
  • wevian/x264
  • wangluls/x264
  • RellikJaeger/x264
  • hum/x264
  • rogerhardiman/x264
  • jankowalski12611/x264
  • zhijie1996/x264
  • yinshiyou/x264
  • Freed-Wu/x264
  • yajcoca/x264
  • bUd/x264
  • chienvannguyen2020/x264
  • nurbinakhatun386/x264
  • Siberiawind/x-264-meson
  • HecaiYuan/x264
  • david.chen/x264
  • Ytsejam76/x264
  • robUx4/x264
  • zhaoshiz/x-264-arm64ec
  • yintong.ustc/x-264-bd-ventana
  • nekobasu/x264
  • Courmisch/x264
  • BD-qjy/x264
  • quink/x264
81 results
Show changes
Commits on Source (3)
  • Janne Grunau's avatar
    aarch64/asm: optimize cabac_encode_terminal with extrinsic knowledge · 8578bd9f
    Janne Grunau authored
    Approach taken from x86 asm. Overall speedup meaningless.
    cabac_encode_terminal is on average twice as fast on cortex-a53 while
    encoding with the following command:
    ./x264 --threads 1 --profile high --preset veryfast --crf 15 -o /dev/null park_joy_420_720p50.y4m
    
    Less relative speedup on cortex-a72/73.
    8578bd9f
  • Janne Grunau's avatar
    aarch64/asm: support offsets in movrel macro · 9981ea83
    Janne Grunau authored
    Imported from dav1d.
    9981ea83
  • Janne Grunau's avatar
    aarch64/asm: optimize cabac asm · 8bd6d280
    Janne Grunau authored
    0.5% - 2% overall speedup on
    `./x264 --threads X --profile high --preset veryfast --crf 15 -o /dev/null park_joy_420_720p50.y4m`
    cabac is responsible for roughly 1/6 of the CPU use.
    Branch mispredictions are reduced by 15% to 20%.
    
    cortex-a53: 0.5% faster
    cortex-a72: 2%  faster
    neoverse-n1: 0.9% faster
    8bd6d280
......@@ -26,11 +26,19 @@
#include "common/common.h"
#include "asm-offsets.h"
// Compile-time assertion. Declares an int array named assert_<name> whose
// length is 1 when x is non-zero and -1 when x is zero. A negative array
// length is not valid C, so a false condition breaks the build.
#define STATIC_ASSERT(name, x) int assert_##name[2 * !!(x) - 1]
// Checks at compile time that member m of struct type s lives at byte offset o.
// Expands to the declaration of struct check_<s>_<m>, whose array members get a
// length of 1 when offsetof(s, m) == o and -1 (a compile error) otherwise.
// NOTE(review): the m_<m> member and the STATIC_ASSERT member encode the same
// condition; one of them looks redundant -- confirm which one is intended.
#define X264_CHECK_OFFSET(s, m, o) struct check_##s##_##m \
{ \
int m_##m[2 * (offsetof(s, m) == o) - 1]; \
STATIC_ASSERT(offset_##m, offsetof(s, m) == o); \
}
// Checks at compile time that member b of struct type s starts exactly
// sizeof(type) bytes after the start of member a, i.e.
// offsetof(s, a) + sizeof(type) == offsetof(s, b).
// Expands to the declaration of struct check_<s>_<a>_<b> holding a single
// STATIC_ASSERT member, so a mismatch fails the build.
#define X264_CHECK_REL_OFFSET(s, a, type, b) struct check_##s##_##a##_##b \
{ \
STATIC_ASSERT(rel_offset_##a##_##b, offsetof(s, a) + sizeof(type) == offsetof(s, b)); \
}
X264_CHECK_OFFSET(x264_cabac_t, i_low, CABAC_I_LOW);
X264_CHECK_OFFSET(x264_cabac_t, i_range, CABAC_I_RANGE);
X264_CHECK_OFFSET(x264_cabac_t, i_queue, CABAC_I_QUEUE);
......@@ -40,3 +48,9 @@ X264_CHECK_OFFSET(x264_cabac_t, p, CABAC_P);
X264_CHECK_OFFSET(x264_cabac_t, p_end, CABAC_P_END);
X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded, CABAC_F8_BITS_ENCODED);
X264_CHECK_OFFSET(x264_cabac_t, state, CABAC_STATE);
// the aarch64 asm makes the following additional assumptions about the
// x264_cabac_t memory layout
X264_CHECK_REL_OFFSET(x264_cabac_t, i_low, int, i_range);
X264_CHECK_REL_OFFSET(x264_cabac_t, i_queue, int, i_bytes_outstanding);
......@@ -101,15 +101,30 @@ MACH .const_data
\name:
.endm
.macro movrel rd, val
#if defined(PIC) && defined(__APPLE__)
.macro movrel rd, val, offset=0
#if defined(__APPLE__)
.if \offset < 0
adrp \rd, \val@PAGE
add \rd, \rd, \val@PAGEOFF
#elif defined(PIC)
sub \rd, \rd, -(\offset)
.else
adrp \rd, \val+(\offset)@PAGE
add \rd, \rd, \val+(\offset)@PAGEOFF
.endif
#elif defined(PIC) && defined(_WIN32)
.if \offset < 0
adrp \rd, \val
add \rd, \rd, :lo12:\val
sub \rd, \rd, -(\offset)
.else
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
.endif
#elif defined(PIC)
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
#else
ldr \rd, =\val
ldr \rd, =\val+\offset
#endif
.endm
......
......@@ -30,54 +30,51 @@
// w12 holds x264_cabac_t.i_range
function cabac_encode_decision_asm, export=1
movrel x8, X264(cabac_range_lps)
movrel x9, X264(cabac_transition)
add w10, w1, #CABAC_STATE
ldrb w3, [x0, x10] // i_state
add w10, w1, #CABAC_STATE
ldrb w3, [x0, w10, uxtw] // i_state
ldr w12, [x0, #CABAC_I_RANGE]
and x4, x3, #~1
movrel x8, X264(cabac_range_lps), -4
movrel x9, X264(cabac_transition)
ubfx x4, x3, #1, #7
asr w5, w12, #6
add x8, x8, x4, lsl #1
sub w5, w5, #4
eor w6, w2, w3 // b ^ i_state
ldrb w4, [x8, x5] // i_range_lps
ldr w11, [x0, #CABAC_I_LOW]
add x8, x8, x4, lsl #2
orr w14, w2, w3, lsl #1
ldrb w4, [x8, w5, uxtw] // i_range_lps
ldr w11, [x0, #CABAC_I_LOW]
eor w6, w2, w3 // b ^ i_state
ldrb w9, [x9, w14, uxtw]
sub w12, w12, w4
tbz w6, #0, 1f // (b ^ i_state) & 1
add w11, w11, w12
mov w12, w4
1:
orr w4, w2, w3, lsl #1
ldrb w9, [x9, x4]
strb w9, [x0, x10] // i_state
add w7, w11, w12
tst w6, #1 // (b ^ i_state) & 1
csel w12, w4, w12, ne
csel w11, w7, w11, ne
strb w9, [x0, w10, uxtw] // i_state
cabac_encode_renorm:
clz w5, w12
ldr w2, [x0, #CABAC_I_QUEUE]
clz w5, w12
sub w5, w5, #23
lsl w12, w12, w5
lsl w11, w11, w5
2:
lsl w12, w12, w5
adds w2, w2, w5
str w12, [x0, #CABAC_I_RANGE]
b.lt 0f
b.ge cabac_putbyte
stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range
str w2, [x0, #CABAC_I_QUEUE]
ret
.align 5
cabac_putbyte:
mov w13, #0x400
add w12, w2, #10
lsl w13, w13, w2
asr w4, w11, w12 // out
ldr w6, [x0, #CABAC_I_BYTES_OUTSTANDING]
add w14, w2, #10
mov w13, #-1
sub w2, w2, #8
sub w13, w13, #1
asr w4, w11, w14 // out
lsl w13, w13, w14
subs w5, w4, #0xff
and w11, w11, w13
ldr w6, [x0, #CABAC_I_BYTES_OUTSTANDING]
str w2, [x0, #CABAC_I_QUEUE]
b.ne 1f
add w6, w6, #1
str w11, [x0, #CABAC_I_LOW]
str w6, [x0, #CABAC_I_BYTES_OUTSTANDING]
ret
bic w11, w11, w13
cinc w6, w6, eq
b.eq 0f
1:
ldr x7, [x0, #CABAC_P]
......@@ -93,15 +90,14 @@ cabac_putbyte:
b.gt 2b
3:
strb w4, [x7], #1
str wzr, [x0, #CABAC_I_BYTES_OUTSTANDING]
str x7, [x0, #CABAC_P]
0:
str w11, [x0, #CABAC_I_LOW]
str w2, [x0, #CABAC_I_QUEUE]
stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range
stp w2, w6, [x0, #CABAC_I_QUEUE] // store i_queue, i_bytes_outstanding
ret
endfunc
function cabac_encode_bypass_asm, export=1
function cabac_encode_bypass_asm, export=1, align=5
ldr w12, [x0, #CABAC_I_RANGE]
ldr w11, [x0, #CABAC_I_LOW]
ldr w2, [x0, #CABAC_I_QUEUE]
......@@ -114,9 +110,22 @@ function cabac_encode_bypass_asm, export=1
ret
endfunc
function cabac_encode_terminal_asm, export=1
function cabac_encode_terminal_asm, export=1, align=5
ldr w12, [x0, #CABAC_I_RANGE]
ldr w11, [x0, #CABAC_I_LOW]
sub w12, w12, #2
b cabac_encode_renorm
tbz w12, #8, 1f
str w12, [x0, #CABAC_I_RANGE]
ret
1:
ldr w2, [x0, #CABAC_I_QUEUE]
ldr w11, [x0, #CABAC_I_LOW]
lsl w12, w12, #1
adds w2, w2, #1
lsl w11, w11, #1
b.ge cabac_putbyte
stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range
str w2, [x0, #CABAC_I_QUEUE]
ret
endfunc