Commit 98e9543b authored by Martin Storsjö, committed by Anton Mitrofanov

aarch64: Update the var2 functions to the new signature

The existing functions could easily be used by just calling them
twice - this would give the following cycle numbers from checkasm:

var2_8x8_c:      4110
var2_8x8_neon:   1505
var2_8x16_c:     8019
var2_8x16_neon:  2545

However, by merging both passes into the same function, we get the
following speedup:
var2_8x8_neon:   1205
var2_8x16_neon:  2327
parent 824802ad
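
For context, here is a minimal C sketch of what the merged function computes. The helper name var2_8xh_ref and the FENC_STRIDE/FDEC_STRIDE values are assumptions for illustration (x264's fixed-stride encode/decode buffers), not part of this commit: one call now covers both chroma planes, writing each plane's SSD through the out-parameter and returning the summed variance.

    #include <stdint.h>

    #define FENC_STRIDE 16   /* assumed fixed stride of the fenc buffer */
    #define FDEC_STRIDE 32   /* assumed fixed stride of the fdec buffer */

    /* Hypothetical reference for the new signature: the U plane sits at
     * offset 0 and the V plane at offset 8 (fenc) / 16 (fdec), so one
     * pass over h rows covers both planes that previously needed two
     * separate calls. */
    static int var2_8xh_ref( const uint8_t *fenc, const uint8_t *fdec,
                             int *ssd, int h )
    {
        int sum_u = 0, sum_v = 0, sqr_u = 0, sqr_v = 0;
        for( int y = 0; y < h; y++ )
        {
            for( int x = 0; x < 8; x++ )
            {
                int du = fenc[x]     - fdec[x];       /* U difference */
                int dv = fenc[x + 8] - fdec[x + 16];  /* V difference */
                sum_u += du; sqr_u += du * du;
                sum_v += dv; sqr_v += dv * dv;
            }
            fenc += FENC_STRIDE;
            fdec += FDEC_STRIDE;
        }
        ssd[0] = sqr_u;   /* per-plane SSDs, as stored to [x2] below */
        ssd[1] = sqr_v;
        /* var = SSD - sum^2/N with N = 8*h pixels, so the shift is 6
         * for 8x8 and 7 for 8x16 - matching "lsr # 6 + (\h >> 4)". */
        return sqr_u - ((sum_u * sum_u) >> (6 + (h >> 4)))
             + sqr_v - ((sum_v * sum_v) >> (6 + (h >> 4)));
    }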
@@ -569,57 +569,65 @@ endfunc
 .macro pixel_var2_8 h
 function x264_pixel_var2_8x\h\()_neon, export=1
-    ld1             {v16.8b}, [x0], x1
-    ld1             {v18.8b}, [x2], x3
-    ld1             {v17.8b}, [x0], x1
-    ld1             {v19.8b}, [x2], x3
-    mov             x5,  \h - 4
-    usubl           v6.8h,  v16.8b, v18.8b
-    usubl           v7.8h,  v17.8b, v19.8b
-    ld1             {v16.8b}, [x0], x1
-    ld1             {v18.8b}, [x2], x3
-    smull           v2.4s,  v6.4h,  v6.4h
-    smull2          v3.4s,  v6.8h,  v6.8h
-    add             v0.8h,  v6.8h,  v7.8h
-    smlal           v2.4s,  v7.4h,  v7.4h
-    smlal2          v3.4s,  v7.8h,  v7.8h
+    mov             x3,  #16
+    ld1             {v16.8b}, [x0], #8
+    ld1             {v18.8b}, [x1], x3
+    ld1             {v17.8b}, [x0], #8
+    ld1             {v19.8b}, [x1], x3
+    mov             x5,  \h - 2
+    usubl           v0.8h,  v16.8b, v18.8b
+    usubl           v1.8h,  v17.8b, v19.8b
+    ld1             {v16.8b}, [x0], #8
+    ld1             {v18.8b}, [x1], x3
+    smull           v2.4s,  v0.4h,  v0.4h
+    smull2          v3.4s,  v0.8h,  v0.8h
+    smull           v4.4s,  v1.4h,  v1.4h
+    smull2          v5.4s,  v1.8h,  v1.8h
     usubl           v6.8h,  v16.8b, v18.8b

-1:  subs            x5,  x5,  #2
-    ld1             {v17.8b}, [x0], x1
-    ld1             {v19.8b}, [x2], x3
+1:  subs            x5,  x5,  #1
+    ld1             {v17.8b}, [x0], #8
+    ld1             {v19.8b}, [x1], x3
     smlal           v2.4s,  v6.4h,  v6.4h
     smlal2          v3.4s,  v6.8h,  v6.8h
     usubl           v7.8h,  v17.8b, v19.8b
     add             v0.8h,  v0.8h,  v6.8h
-    ld1             {v16.8b}, [x0], x1
-    ld1             {v18.8b}, [x2], x3
-    smlal           v2.4s,  v7.4h,  v7.4h
-    smlal2          v3.4s,  v7.8h,  v7.8h
+    ld1             {v16.8b}, [x0], #8
+    ld1             {v18.8b}, [x1], x3
+    smlal           v4.4s,  v7.4h,  v7.4h
+    smlal2          v5.4s,  v7.8h,  v7.8h
     usubl           v6.8h,  v16.8b, v18.8b
-    add             v0.8h,  v0.8h,  v7.8h
+    add             v1.8h,  v1.8h,  v7.8h
     b.gt            1b

-    ld1             {v17.8b}, [x0], x1
-    ld1             {v19.8b}, [x2], x3
+    ld1             {v17.8b}, [x0], #8
+    ld1             {v19.8b}, [x1], x3
     smlal           v2.4s,  v6.4h,  v6.4h
     smlal2          v3.4s,  v6.8h,  v6.8h
     usubl           v7.8h,  v17.8b, v19.8b
     add             v0.8h,  v0.8h,  v6.8h
-    smlal           v2.4s,  v7.4h,  v7.4h
-    add             v0.8h,  v0.8h,  v7.8h
-    smlal2          v3.4s,  v7.8h,  v7.8h
+    smlal           v4.4s,  v7.4h,  v7.4h
+    add             v1.8h,  v1.8h,  v7.8h
+    smlal2          v5.4s,  v7.8h,  v7.8h
     saddlv          s0,  v0.8h
+    saddlv          s1,  v1.8h
     add             v2.4s,  v2.4s,  v3.4s
+    add             v4.4s,  v4.4s,  v5.4s
     mov             w0,  v0.s[0]
-    addv            s1,  v2.4s
-    sxtw            x0,  w0
     mov             w1,  v1.s[0]
-    mul             x0,  x0,  x0
-    str             w1,  [x4]
-    sub             x0,  x1,  x0,  lsr # 6 + (\h >> 4)
+    addv            s2,  v2.4s
+    addv            s4,  v4.4s
+    mul             w0,  w0,  w0
+    mul             w1,  w1,  w1
+    mov             w3,  v2.s[0]
+    mov             w4,  v4.s[0]
+    sub             w0,  w3,  w0,  lsr # 6 + (\h >> 4)
+    sub             w1,  w4,  w1,  lsr # 6 + (\h >> 4)
+    str             w3,  [x2]
+    add             w0,  w0,  w1
+    str             w4,  [x2, #4]
     ret
 endfunc
@@ -61,8 +61,8 @@ uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr
 uint64_t x264_pixel_var_8x8_neon  ( uint8_t *, intptr_t );
 uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
 uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
-int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
+int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
 uint64_t x264_pixel_hadamard_ac_8x8_neon  ( uint8_t *, intptr_t );
 uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
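
Under the new prototype the strides are implicit, so a call site reduces to the following sketch (buffer names hypothetical):

    int ssd[2];
    /* fenc/fdec point at the fixed-stride chroma blocks */
    int var = x264_pixel_var2_8x8_neon( fenc, fdec, ssd );
    /* var = var(U) + var(V); ssd[0] = SSD(U), ssd[1] = SSD(V) */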
@@ -1452,8 +1452,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_neon;
         pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_neon;
         pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
-//      pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_neon;
-//      pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
+        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_neon;
+        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
         pixf->vsad = x264_pixel_vsad_neon;
         pixf->asd8 = x264_pixel_asd8_neon;