Commit d040d285 authored by Janne Grunau, committed by Anton Mitrofanov

aarch64: implement x264_pixel_sa8d_satd_16x16_neon

~20% faster than calling pixel_sa8d_16x16 and pixel_satd_16x16
separately.
parent 91a01d4c
......@@ -803,7 +803,7 @@ endfunc
function x264_pixel_sa8d_8x8_neon, export=1
mov x4, x30
bl x264_sa8d_8x8_neon
bl pixel_sa8d_8x8_neon
add v0.8h, v0.8h, v1.8h
uaddlv s0, v0.8h
mov w0, v0.s[0]
......@@ -814,20 +814,20 @@ endfunc
function x264_pixel_sa8d_16x16_neon, export=1
mov x4, x30
bl x264_sa8d_8x8_neon
bl pixel_sa8d_8x8_neon
uaddlp v30.4s, v0.8h
uaddlp v31.4s, v1.8h
bl x264_sa8d_8x8_neon
bl pixel_sa8d_8x8_neon
uadalp v30.4s, v0.8h
uadalp v31.4s, v1.8h
sub x0, x0, x1, lsl #4
sub x2, x2, x3, lsl #4
add x0, x0, #8
add x2, x2, #8
bl x264_sa8d_8x8_neon
bl pixel_sa8d_8x8_neon
uadalp v30.4s, v0.8h
uadalp v31.4s, v1.8h
bl x264_sa8d_8x8_neon
bl pixel_sa8d_8x8_neon
uadalp v30.4s, v0.8h
uadalp v31.4s, v1.8h
add v0.4s, v30.4s, v31.4s
......@@ -838,13 +838,48 @@ function x264_pixel_sa8d_16x16_neon, export=1
ret x4
endfunc
function x264_sa8d_8x8_neon
.macro sa8d_satd_8x8 satd=
function pixel_sa8d_\satd\()8x8_neon
load_diff_fly_8x8
SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
.ifc \satd, satd_
transpose v0.8h, v1.8h, v16.8h, v17.8h
transpose v2.8h, v3.8h, v18.8h, v19.8h
transpose v4.8h, v5.8h, v20.8h, v21.8h
transpose v6.8h, v7.8h, v22.8h, v23.8h
SUMSUB_AB v24.8h, v25.8h, v0.8h, v1.8h
SUMSUB_AB v26.8h, v27.8h, v2.8h, v3.8h
SUMSUB_AB v0.8h, v1.8h, v4.8h, v5.8h
SUMSUB_AB v2.8h, v3.8h, v6.8h, v7.8h
transpose v4.4s, v6.4s, v24.4s, v26.4s
transpose v5.4s, v7.4s, v25.4s, v27.4s
transpose v24.4s, v26.4s, v0.4s, v2.4s
transpose v25.4s, v27.4s, v1.4s, v3.4s
abs v0.8h, v4.8h
abs v1.8h, v5.8h
abs v2.8h, v6.8h
abs v3.8h, v7.8h
abs v4.8h, v24.8h
abs v5.8h, v25.8h
abs v6.8h, v26.8h
abs v7.8h, v27.8h
umax v0.8h, v0.8h, v2.8h
umax v1.8h, v1.8h, v3.8h
umax v2.8h, v4.8h, v6.8h
umax v3.8h, v5.8h, v7.8h
add v26.8h, v0.8h, v1.8h
add v27.8h, v2.8h, v3.8h
.endif
SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h
SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h
SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h
......@@ -855,15 +890,15 @@ function x264_sa8d_8x8_neon
transpose v22.8h, v23.8h, v18.8h, v19.8h
transpose v6.8h, v7.8h, v2.8h, v3.8h
SUMSUB_AB v28.8h, v29.8h, v20.8h, v21.8h
SUMSUB_AB v2.8h, v3.8h, v20.8h, v21.8h
SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h
SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h
SUMSUB_AB v26.8h, v27.8h, v6.8h, v7.8h
SUMSUB_AB v4.8h, v5.8h, v6.8h, v7.8h
transpose v20.4s, v22.4s, v28.4s, v0.4s
transpose v21.4s, v23.4s, v29.4s, v1.4s
transpose v16.4s, v18.4s, v24.4s, v26.4s
transpose v17.4s, v19.4s, v25.4s, v27.4s
transpose v20.4s, v22.4s, v2.4s, v0.4s
transpose v21.4s, v23.4s, v3.4s, v1.4s
transpose v16.4s, v18.4s, v24.4s, v4.4s
transpose v17.4s, v19.4s, v25.4s, v5.4s
SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h
SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h
......@@ -894,7 +929,47 @@ function x264_sa8d_8x8_neon
ret
endfunc
.endm
sa8d_satd_8x8
sa8d_satd_8x8 satd_
// uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t,
//                                           uint8_t *, intptr_t )
//
// Produces the sa8d and the satd cost of a 16x16 block in a single pass by
// calling pixel_sa8d_satd_8x8_neon four times and summing its partial results.
//
// In:    x0, x2 = the two pixel pointers; x1, x3 = their respective strides
// Out:   x0     = sa8d in bits 31:0, satd in bits 63:32
// Regs:  x4 holds the return address (every bl overwrites x30);
//        v28/v29 accumulate satd, v30/v31 accumulate sa8d, all as 4 x 32 bit.
// NOTE(review): relies on the 8x8 helper leaving x1, x3, x4 and v28-v31
// untouched and on it advancing x0/x2 by eight rows per call -- confirm
// against the helper body.
function x264_pixel_sa8d_satd_16x16_neon, export=1
    mov             x4,  x30

    // 1st 8x8 call: seed the 32-bit accumulators with the pairwise-widened
    // 16-bit partial sums (sa8d in v0/v1, satd in v26/v27).
    bl              pixel_sa8d_satd_8x8_neon
    uaddlp          v28.4s, v26.8h
    uaddlp          v29.4s, v27.8h
    uaddlp          v30.4s, v0.8h
    uaddlp          v31.4s, v1.8h

    // 2nd 8x8 call: widen and accumulate.
    bl              pixel_sa8d_satd_8x8_neon
    uadalp          v28.4s, v26.8h
    uadalp          v29.4s, v27.8h
    uadalp          v30.4s, v0.8h
    uadalp          v31.4s, v1.8h

    // Move both pointers 8 bytes to the right and 16 rows back up.
    add             x0,  x0,  #8
    add             x2,  x2,  #8
    sub             x0,  x0,  x1, lsl #4
    sub             x2,  x2,  x3, lsl #4

    // 3rd 8x8 call.
    bl              pixel_sa8d_satd_8x8_neon
    uadalp          v28.4s, v26.8h
    uadalp          v29.4s, v27.8h
    uadalp          v30.4s, v0.8h
    uadalp          v31.4s, v1.8h

    // 4th 8x8 call.
    bl              pixel_sa8d_satd_8x8_neon
    uadalp          v28.4s, v26.8h
    uadalp          v29.4s, v27.8h
    uadalp          v30.4s, v0.8h
    uadalp          v31.4s, v1.8h

    // Collapse each accumulator pair to a single 32-bit total.
    add             v1.4s,  v28.4s, v29.4s      // satd
    add             v0.4s,  v30.4s, v31.4s      // sa8d
    addv            s1,  v1.4s
    addv            s0,  v0.4s
    urshr           v0.4s,  v0.4s,  #1          // sa8d = (sum + 1) >> 1

    // Pack the result: the w-register writes clear the upper halves, so the
    // shifted add places satd above sa8d without overlap.
    fmov            w1,  s1
    fmov            w0,  s0
    add             x0,  x0,  x1, lsl #32
    ret             x4
endfunc
.macro HADAMARD_AC w h
function x264_pixel_hadamard_ac_\w\()x\h\()_neon, export=1
......
......@@ -4,6 +4,7 @@
* Copyright (C) 2009-2014 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
......@@ -49,6 +50,7 @@ DECL_X1( ssd, neon )
int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
......
......@@ -1422,6 +1422,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment