Commit f4a82a54 authored by Janne Grunau, committed by Fiona Glaser

aarch64: pixel metrics NEON asm

Ported from the ARM NEON asm.
parent 3e57554e
Makefile
@@ -126,7 +126,7 @@ endif
# AArch64 NEON optims
ifeq ($(ARCH),AARCH64)
ifneq ($(AS),)
-ASMSRC +=
+ASMSRC += common/aarch64/pixel-a.S
SRCS +=
OBJASM = $(ASMSRC:%.S=%.o)
endif
common/aarch64/asm.S
@@ -97,3 +97,17 @@ MACH .const_data
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
#define X(s) JOIN(EXTERN_ASM, s)
#define FDEC_STRIDE 32
#define FENC_STRIDE 16
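// butterfly: compute the sum and difference of two vectors in one step,
// the basic operation of the Hadamard transforms used by SATD below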
.macro SUMSUB_AB sum, sub, a, b
add \sum, \a, \b
sub \sub, \a, \b
.endm
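// interleave the even/odd elements of two vectors; the element size is
// taken from the operand suffixes at the call site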
.macro transpose t1, t2, s1, s2
trn1 \t1, \s1, \s2
trn2 \t2, \s1, \s2
.endm
common/aarch64/pixel-a.S (new file)
/*****************************************************************************
* pixel.S: aarch64 pixel metrics
*****************************************************************************
* Copyright (C) 2009-2014 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
const mask
.rept 16
.byte 0xff
.endr
.rept 16
.byte 0x00
.endr
endconst
const mask_ac_4_8
.short 0, -1, -1, -1, 0, -1, -1, -1
.short 0, -1, -1, -1, -1, -1, -1, -1
endconst
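// SAD: the two blocks are loaded interleaved, pix1 from [x0] with stride
// x1 and pix2 from [x2] with stride x3; uabdl seeds the widened absolute
// difference accumulator, uabal folds further rows into it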
.macro SAD_START_4
ld1 {v1.s}[0], [x2], x3
ld1 {v0.s}[0], [x0], x1
ld1 {v1.s}[1], [x2], x3
ld1 {v0.s}[1], [x0], x1
uabdl v16.8h, v0.8b, v1.8b
.endm
.macro SAD_4
ld1 {v1.s}[0], [x2], x3
ld1 {v0.s}[0], [x0], x1
ld1 {v1.s}[1], [x2], x3
ld1 {v0.s}[1], [x0], x1
uabal v16.8h, v0.8b, v1.8b
.endm
.macro SAD_START_8
ld1 {v1.8b}, [x2], x3
ld1 {v0.8b}, [x0], x1
ld1 {v3.8b}, [x2], x3
ld1 {v2.8b}, [x0], x1
uabdl v16.8h, v0.8b, v1.8b
uabdl v17.8h, v2.8b, v3.8b
.endm
.macro SAD_8
ld1 {v1.8b}, [x2], x3
ld1 {v0.8b}, [x0], x1
ld1 {v3.8b}, [x2], x3
ld1 {v2.8b}, [x0], x1
uabal v16.8h, v0.8b, v1.8b
uabal v17.8h, v2.8b, v3.8b
.endm
.macro SAD_START_16
ld1 {v1.16b}, [x2], x3
ld1 {v0.16b}, [x0], x1
ld1 {v3.16b}, [x2], x3
ld1 {v2.16b}, [x0], x1
uabdl v16.8h, v0.8b, v1.8b
uabdl2 v17.8h, v0.16b, v1.16b
uabal v16.8h, v2.8b, v3.8b
uabal2 v17.8h, v2.16b, v3.16b
.endm
.macro SAD_16
ld1 {v1.16b}, [x2], x3
ld1 {v0.16b}, [x0], x1
ld1 {v3.16b}, [x2], x3
ld1 {v2.16b}, [x0], x1
uabal v16.8h, v0.8b, v1.8b
uabal2 v17.8h, v0.16b, v1.16b
uabal v16.8h, v2.8b, v3.8b
uabal2 v17.8h, v2.16b, v3.16b
.endm
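// emit pixel_sad_WxH: one start macro to initialize the accumulators,
// h/2 - 1 accumulation steps, then a horizontal add down to a scalar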
.macro SAD_FUNC w, h, name
function x264_pixel_sad\name\()_\w\()x\h\()_neon, export=1
SAD_START_\w
.rept \h / 2 - 1
SAD_\w
.endr
.if \w > 4
add v16.8h, v16.8h, v17.8h
.endif
uaddlv s0, v16.8h
fmov w0, s0
ret
endfunc
.endm
SAD_FUNC 4, 4
SAD_FUNC 4, 8
SAD_FUNC 8, 4
SAD_FUNC 8, 8
SAD_FUNC 8, 16
SAD_FUNC 16, 8
SAD_FUNC 16, 16
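// SAD_X: one encoded block from [x0] (FENC_STRIDE) is compared against
// 3 or 4 reference blocks sharing stride x5; per-reference sums build up
// in v16-v19 (v20-v23 hold the high halves for the 16-wide case)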
.macro SAD_X_4 x, first=uabal
ld1 {v0.s}[0], [x0], x7
ld1 {v1.s}[0], [x1], x5
ld1 {v0.s}[1], [x0], x7
ld1 {v1.s}[1], [x1], x5
\first v16.8h, v1.8b, v0.8b
ld1 {v2.s}[0], [x2], x5
ld1 {v2.s}[1], [x2], x5
\first v17.8h, v2.8b, v0.8b
ld1 {v3.s}[0], [x3], x5
ld1 {v3.s}[1], [x3], x5
\first v18.8h, v3.8b, v0.8b
.if \x == 4
ld1 {v4.s}[0], [x4], x5
ld1 {v4.s}[1], [x4], x5
\first v19.8h, v4.8b, v0.8b
.endif
.endm
.macro SAD_X_8 x, first=uabal
ld1 {v0.8b}, [x0], x7
ld1 {v1.8b}, [x1], x5
\first v16.8h, v1.8b, v0.8b
ld1 {v2.8b}, [x2], x5
ld1 {v5.8b}, [x0], x7
\first v17.8h, v2.8b, v0.8b
ld1 {v3.8b}, [x3], x5
ld1 {v1.8b}, [x1], x5
\first v18.8h, v3.8b, v0.8b
uabal v16.8h, v1.8b, v5.8b
ld1 {v2.8b}, [x2], x5
ld1 {v3.8b}, [x3], x5
uabal v17.8h, v2.8b, v5.8b
uabal v18.8h, v3.8b, v5.8b
.if \x == 4
ld1 {v4.8b}, [x4], x5
\first v19.8h, v4.8b, v0.8b
ld1 {v4.8b}, [x4], x5
uabal v19.8h, v4.8b, v5.8b
.endif
.endm
.macro SAD_X_16 x, first=uabal
ld1 {v0.16b}, [x0], x7
ld1 {v1.16b}, [x1], x5
\first v16.8h, v1.8b, v0.8b
\first\()2 v20.8h, v1.16b, v0.16b
ld1 {v2.16b}, [x2], x5
ld1 {v5.16b}, [x0], x7
\first v17.8h, v2.8b, v0.8b
\first\()2 v21.8h, v2.16b, v0.16b
ld1 {v3.16b}, [x3], x5
ld1 {v1.16b}, [x1], x5
\first v18.8h, v3.8b, v0.8b
\first\()2 v22.8h, v3.16b, v0.16b
uabal v16.8h, v1.8b, v5.8b
uabal2 v20.8h, v1.16b, v5.16b
ld1 {v2.16b}, [x2], x5
ld1 {v3.16b}, [x3], x5
uabal v17.8h, v2.8b, v5.8b
uabal2 v21.8h, v2.16b, v5.16b
uabal v18.8h, v3.8b, v5.8b
uabal2 v22.8h, v3.16b, v5.16b
.if \x == 4
ld1 {v4.16b}, [x4], x5
\first v19.8h, v4.8b, v0.8b
\first\()2 v23.8h, v4.16b, v0.16b
ld1 {v4.16b}, [x4], x5
uabal v19.8h, v4.8b, v5.8b
uabal2 v23.8h, v4.16b, v5.16b
.endif
.endm
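// sad_x3 passes the stride and score pointer one argument earlier than
// sad_x4, so move them into the registers the shared body expects
// (x5 = stride, x6 = scores)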
.macro SAD_X_FUNC x, w, h
function x264_pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
.if \x == 3
mov x6, x5
mov x5, x4
.endif
mov x7, #FENC_STRIDE
SAD_X_\w \x, uabdl
.rept \h / 2 - 1
SAD_X_\w \x
.endr
.if \w > 8
add v16.8h, v16.8h, v20.8h
add v17.8h, v17.8h, v21.8h
add v18.8h, v18.8h, v22.8h
.if \x == 4
add v19.8h, v19.8h, v23.8h
.endif
.endif
// add up the sads
uaddlv s0, v16.8h
uaddlv s1, v17.8h
uaddlv s2, v18.8h
stp s0, s1, [x6], #8
.if \x == 3
str s2, [x6]
.else
uaddlv s3, v19.8h
stp s2, s3, [x6]
.endif
ret
endfunc
.endm
SAD_X_FUNC 3, 4, 4
SAD_X_FUNC 3, 4, 8
SAD_X_FUNC 3, 8, 4
SAD_X_FUNC 3, 8, 8
SAD_X_FUNC 3, 8, 16
SAD_X_FUNC 3, 16, 8
SAD_X_FUNC 3, 16, 16
SAD_X_FUNC 4, 4, 4
SAD_X_FUNC 4, 4, 8
SAD_X_FUNC 4, 8, 4
SAD_X_FUNC 4, 8, 8
SAD_X_FUNC 4, 8, 16
SAD_X_FUNC 4, 16, 8
SAD_X_FUNC 4, 16, 16
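// SSD: widen the pixel differences with usubl, then square and
// accumulate them into 32-bit lanes with smull/smlal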
.macro SSD_START_4
ld1 {v16.s}[0], [x0], x1
ld1 {v17.s}[0], [x2], x3
usubl v2.8h, v16.8b, v17.8b
ld1 {v16.s}[0], [x0], x1
ld1 {v17.s}[0], [x2], x3
smull v0.4s, v2.4h, v2.4h
.endm
.macro SSD_4
usubl v2.8h, v16.8b, v17.8b
ld1 {v16.s}[0], [x0], x1
ld1 {v17.s}[0], [x2], x3
smlal v0.4s, v2.4h, v2.4h
.endm
.macro SSD_END_4
usubl v2.8h, v16.8b, v17.8b
smlal v0.4s, v2.4h, v2.4h
.endm
.macro SSD_START_8
ld1 {v16.8b}, [x0], x1
ld1 {v17.8b}, [x2], x3
usubl v2.8h, v16.8b, v17.8b
ld1 {v16.8b}, [x0], x1
smull v0.4s, v2.4h, v2.4h
ld1 {v17.8b}, [x2], x3
smlal2 v0.4s, v2.8h, v2.8h
.endm
.macro SSD_8
usubl v2.8h, v16.8b, v17.8b
ld1 {v16.8b}, [x0], x1
smlal v0.4s, v2.4h, v2.4h
ld1 {v17.8b}, [x2], x3
smlal2 v0.4s, v2.8h, v2.8h
.endm
.macro SSD_END_8
usubl v2.8h, v16.8b, v17.8b
smlal v0.4s, v2.4h, v2.4h
smlal2 v0.4s, v2.8h, v2.8h
.endm
.macro SSD_START_16
ld1 {v16.16b}, [x0], x1
ld1 {v17.16b}, [x2], x3
usubl v2.8h, v16.8b, v17.8b
usubl2 v3.8h, v16.16b, v17.16b
ld1 {v16.16b}, [x0], x1
smull v0.4s, v2.4h, v2.4h
smull2 v1.4s, v2.8h, v2.8h
ld1 {v17.16b}, [x2], x3
smlal v0.4s, v3.4h, v3.4h
smlal2 v1.4s, v3.8h, v3.8h
.endm
.macro SSD_16
usubl v2.8h, v16.8b, v17.8b
usubl2 v3.8h, v16.16b, v17.16b
ld1 {v16.16b}, [x0], x1
smlal v0.4s, v2.4h, v2.4h
smlal2 v1.4s, v2.8h, v2.8h
ld1 {v17.16b}, [x2], x3
smlal v0.4s, v3.4h, v3.4h
smlal2 v1.4s, v3.8h, v3.8h
.endm
.macro SSD_END_16
usubl v2.8h, v16.8b, v17.8b
usubl2 v3.8h, v16.16b, v17.16b
smlal v0.4s, v2.4h, v2.4h
smlal2 v1.4s, v2.8h, v2.8h
smlal v0.4s, v3.4h, v3.4h
smlal2 v1.4s, v3.8h, v3.8h
add v0.4s, v0.4s, v1.4s
.endm
.macro SSD_FUNC w h
function x264_pixel_ssd_\w\()x\h\()_neon, export=1
SSD_START_\w
.rept \h-2
SSD_\w
.endr
SSD_END_\w
addv s0, v0.4s
mov w0, v0.s[0]
ret
endfunc
.endm
SSD_FUNC 4, 4
SSD_FUNC 4, 8
SSD_FUNC 8, 4
SSD_FUNC 8, 8
SSD_FUNC 8, 16
SSD_FUNC 16, 8
SSD_FUNC 16, 16
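// variance: v0 gathers the pixel sum, v1/v2 the sum of squares; the
// shared tail x264_var_end returns the sum in the low 32 bits of x0 and
// the sum of squares in the high 32 bits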
.macro pixel_var_8 h
function x264_pixel_var_8x\h\()_neon, export=1
ld1 {v16.8b}, [x0], x1
ld1 {v17.8b}, [x0], x1
mov x2, \h - 4
umull v1.8h, v16.8b, v16.8b
uxtl v0.8h, v16.8b
umull v2.8h, v17.8b, v17.8b
uaddw v0.8h, v0.8h, v17.8b
ld1 {v18.8b}, [x0], x1
uaddlp v1.4s, v1.8h
uaddlp v2.4s, v2.8h
ld1 {v19.8b}, [x0], x1
1: subs x2, x2, #4
uaddw v0.8h, v0.8h, v18.8b
umull v24.8h, v18.8b, v18.8b
ld1 {v20.8b}, [x0], x1
uaddw v0.8h, v0.8h, v19.8b
umull v25.8h, v19.8b, v19.8b
uadalp v1.4s, v24.8h
ld1 {v21.8b}, [x0], x1
uaddw v0.8h, v0.8h, v20.8b
umull v26.8h, v20.8b, v20.8b
uadalp v2.4s, v25.8h
ld1 {v18.8b}, [x0], x1
uaddw v0.8h, v0.8h, v21.8b
umull v27.8h, v21.8b, v21.8b
uadalp v1.4s, v26.8h
ld1 {v19.8b}, [x0], x1
uadalp v2.4s, v27.8h
b.gt 1b
uaddw v0.8h, v0.8h, v18.8b
umull v28.8h, v18.8b, v18.8b
uaddw v0.8h, v0.8h, v19.8b
umull v29.8h, v19.8b, v19.8b
uadalp v1.4s, v28.8h
uadalp v2.4s, v29.8h
b x264_var_end
endfunc
.endm
pixel_var_8 8
pixel_var_8 16
function x264_pixel_var_16x16_neon, export=1
ld1 {v16.16b}, [x0], x1
ld1 {v17.16b}, [x0], x1
mov x2, #14
umull v1.8h, v16.8b, v16.8b
umull2 v2.8h, v16.16b, v16.16b
uxtl v0.8h, v16.8b
uaddlp v1.4s, v1.8h
uaddlp v2.4s, v2.8h
uaddw2 v0.8h, v0.8h, v16.16b
1: subs x2, x2, #2
ld1 {v18.16b}, [x0], x1
uaddw v0.8h, v0.8h, v17.8b
umull v3.8h, v17.8b, v17.8b
uaddw2 v0.8h, v0.8h, v17.16b
umull2 v4.8h, v17.16b, v17.16b
uadalp v1.4s, v3.8h
uadalp v2.4s, v4.8h
ld1 {v17.16b}, [x0], x1
uaddw v0.8h, v0.8h, v18.8b
umull v5.8h, v18.8b, v18.8b
uaddw2 v0.8h, v0.8h, v18.16b
umull2 v6.8h, v18.16b, v18.16b
uadalp v1.4s, v5.8h
uadalp v2.4s, v6.8h
b.gt 1b
uaddw v0.8h, v0.8h, v17.8b
umull v3.8h, v17.8b, v17.8b
uaddw2 v0.8h, v0.8h, v17.16b
umull2 v4.8h, v17.16b, v17.16b
uadalp v1.4s, v3.8h
uadalp v2.4s, v4.8h
endfunc
function x264_var_end
add v1.4s, v1.4s, v2.4s
uaddlv s0, v0.8h
uaddlv d1, v1.4s
mov w0, v0.s[0]
mov x1, v1.d[0]
orr x0, x0, x1, lsl #32
ret
endfunc
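// var2: variance of the difference between two blocks; the sum of
// squared differences is also stored to [x4], and the return value is
// ssd - (sum*sum >> shift) with the shift chosen by block height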
.macro pixel_var2_8 h
function x264_pixel_var2_8x\h\()_neon, export=1
ld1 {v16.8b}, [x0], x1
ld1 {v18.8b}, [x2], x3
ld1 {v17.8b}, [x0], x1
ld1 {v19.8b}, [x2], x3
mov x5, \h - 4
usubl v6.8h, v16.8b, v18.8b
usubl v7.8h, v17.8b, v19.8b
ld1 {v16.8b}, [x0], x1
ld1 {v18.8b}, [x2], x3
smull v2.4s, v6.4h, v6.4h
smull2 v3.4s, v6.8h, v6.8h
add v0.8h, v6.8h, v7.8h
smlal v2.4s, v7.4h, v7.4h
smlal2 v3.4s, v7.8h, v7.8h
usubl v6.8h, v16.8b, v18.8b
1: subs x5, x5, #2
ld1 {v17.8b}, [x0], x1
ld1 {v19.8b}, [x2], x3
smlal v2.4s, v6.4h, v6.4h
smlal2 v3.4s, v6.8h, v6.8h
usubl v7.8h, v17.8b, v19.8b
add v0.8h, v0.8h, v6.8h
ld1 {v16.8b}, [x0], x1
ld1 {v18.8b}, [x2], x3
smlal v2.4s, v7.4h, v7.4h
smlal2 v3.4s, v7.8h, v7.8h
usubl v6.8h, v16.8b, v18.8b
add v0.8h, v0.8h, v7.8h
b.gt 1b
ld1 {v17.8b}, [x0], x1
ld1 {v19.8b}, [x2], x3
smlal v2.4s, v6.4h, v6.4h
smlal2 v3.4s, v6.8h, v6.8h
usubl v7.8h, v17.8b, v19.8b
add v0.8h, v0.8h, v6.8h
smlal v2.4s, v7.4h, v7.4h
add v0.8h, v0.8h, v7.8h
smlal2 v3.4s, v7.8h, v7.8h
saddlv s0, v0.8h
add v2.4s, v2.4s, v3.4s
mov w0, v0.s[0]
addv s1, v2.4s
sxtw x0, w0
mov w1, v1.s[0]
mul x0, x0, x0
str w1, [x4]
sub x0, x1, x0, lsr #6 + (\h >> 4)
ret
endfunc
.endm
pixel_var2_8 8
pixel_var2_8 16
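// satd_4x4: 4x4 Hadamard transform of the differences; the final
// butterfly stage is folded into abs + umax since
// |a+b| + |a-b| == 2*max(|a|,|b|), giving the halved sum that x264's
// SATD convention expects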
function x264_pixel_satd_4x4_neon, export=1
ld1 {v1.s}[0], [x2], x3
ld1 {v0.s}[0], [x0], x1
ld1 {v3.s}[0], [x2], x3
ld1 {v2.s}[0], [x0], x1
ld1 {v1.s}[1], [x2], x3
ld1 {v0.s}[1], [x0], x1
ld1 {v3.s}[1], [x2], x3
ld1 {v2.s}[1], [x0], x1
usubl v0.8h, v0.8b, v1.8b
usubl v1.8h, v2.8b, v3.8b
SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h
zip1 v0.2d, v2.2d, v3.2d
zip2 v1.2d, v2.2d, v3.2d
SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h
trn1 v0.8h, v2.8h, v3.8h
trn2 v1.8h, v2.8h, v3.8h
SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h
trn1 v0.4s, v2.4s, v3.4s
trn2 v1.4s, v2.4s, v3.4s
abs v0.8h, v0.8h
abs v1.8h, v1.8h
umax v0.8h, v0.8h, v1.8h
uaddlv s0, v0.8h
mov w0, v0.s[0]
ret
endfunc
function x264_pixel_satd_4x8_neon, export=1
ld1 {v1.s}[0], [x2], x3
ld1 {v0.s}[0], [x0], x1
ld1 {v3.s}[0], [x2], x3
ld1 {v2.s}[0], [x0], x1
ld1 {v5.s}[0], [x2], x3
ld1 {v4.s}[0], [x0], x1
ld1 {v7.s}[0], [x2], x3
ld1 {v6.s}[0], [x0], x1
ld1 {v1.s}[1], [x2], x3
ld1 {v0.s}[1], [x0], x1
ld1 {v3.s}[1], [x2], x3
ld1 {v2.s}[1], [x0], x1
ld1 {v5.s}[1], [x2], x3
ld1 {v4.s}[1], [x0], x1
ld1 {v7.s}[1], [x2], x3
ld1 {v6.s}[1], [x0], x1
b x264_satd_4x8_8x4_end_neon
endfunc
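// loads four rows of eight pixels from each block, then falls through
// (endfunc emits no ret) into the shared transform tail below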
function x264_pixel_satd_8x4_neon, export=1
ld1 {v1.8b}, [x2], x3
ld1 {v0.8b}, [x0], x1
ld1 {v3.8b}, [x2], x3
ld1 {v2.8b}, [x0], x1
ld1 {v5.8b}, [x2], x3
ld1 {v4.8b}, [x0], x1
ld1 {v7.8b}, [x2], x3
ld1 {v6.8b}, [x0], x1
endfunc
function x264_satd_4x8_8x4_end_neon
usubl v0.8h, v0.8b, v1.8b
usubl v1.8h, v2.8b, v3.8b
usubl v2.8h, v4.8b, v5.8b
usubl v3.8h, v6.8b, v7.8b
SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h
SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h
trn1 v0.8h, v4.8h, v5.8h
trn2 v1.8h, v4.8h, v5.8h
trn1 v2.8h, v6.8h, v7.8h
trn2 v3.8h, v6.8h, v7.8h
SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
trn1 v0.4s, v16.4s, v18.4s
trn2 v1.4s, v16.4s, v18.4s
trn1 v2.4s, v17.4s, v19.4s
trn2 v3.4s, v17.4s, v19.4s
abs v0.8h, v0.8h
abs v1.8h, v1.8h
abs v2.8h, v2.8h
abs v3.8h, v3.8h
umax v0.8h, v0.8h, v1.8h
umax v1.8h, v2.8h, v3.8h
add v0.8h, v0.8h, v1.8h
uaddlv s0, v0.8h
mov w0, v0.s[0]
ret
endfunc
function x264_pixel_satd_8x8_neon, export=1
mov x4, x30
bl x264_satd_8x8_neon
add v0.8h, v0.8h, v1.8h
add v1.8h, v2.8h, v3.8h
add v0.8h, v0.8h, v1.8h
uaddlv s0, v0.8h
mov w0, v0.s[0]
ret x4
endfunc
function x264_pixel_satd_8x16_neon, export=1
mov x4, x30
bl x264_satd_8x8_neon
add v0.8h, v0.8h, v1.8h
add v1.8h, v2.8h, v3.8h
add v30.8h, v0.8h, v1.8h
bl x264_satd_8x8_neon
add v0.8h, v0.8h, v1.8h
add v1.8h, v2.8h, v3.8h
add v31.8h, v0.8h, v1.8h
add v0.8h, v30.8h, v31.8h
uaddlv s0, v0.8h
mov w0, v0.s[0]
ret x4
endfunc
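// widening butterfly: sum and difference of two narrow vectors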
.macro SUMSUBL_AB sum, sub, a, b
uaddl \sum, \a, \b
usubl \sub, \a, \b
.endm
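// load all 8x8 differences into v16-v23, starting the first butterfly
// stage on rows 0-3 (results in v0-v3) while rows 4-7 are still loading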
.macro load_diff_fly_8x8
ld1 {v1.8b}, [x2], x3
ld1 {v0.8b}, [x0], x1
ld1 {v3.8b}, [x2], x3
ld1 {v2.8b}, [x0], x1
usubl v16.8h, v0.8b, v1.8b
ld1 {v5.8b}, [x2], x3
ld1 {v4.8b}, [x0], x1
usubl v17.8h, v2.8b, v3.8b
ld1 {v7.8b}, [x2], x3
ld1 {v6.8b}, [x0], x1
usubl v18.8h, v4.8b, v5.8b
ld1 {v1.8b}, [x2], x3
ld1 {v0.8b}, [x0], x1
usubl v19.8h, v6.8b, v7.8b
ld1 {v3.8b}, [x2], x3
ld1 {v2.8b}, [x0], x1
usubl v20.8h, v0.8b, v1.8b
ld1 {v5.8b}, [x2], x3
ld1 {v4.8b}, [x0], x1
usubl v21.8h, v2.8b, v3.8b
ld1 {v7.8b}, [x2], x3
ld1 {v6.8b}, [x0], x1
SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h
SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h
usubl v22.8h, v4.8b, v5.8b
usubl v23.8h, v6.8b, v7.8b
.endm
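// two butterflies at once; HADAMARD4_V chains two such layers to apply
// a vertical 4-point Hadamard across four row vectors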
.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
SUMSUB_AB \s1, \d1, \a, \b
SUMSUB_AB \s2, \d2, \c, \d
.endm
.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
.endm
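// computes the 8x8 differences, then falls through into the transform
// below; callers reduce the four partial-sum vectors it leaves in v0-v3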
function x264_satd_8x8_neon
load_diff_fly_8x8
endfunc
// one vertical hadamard pass and two horizontal
function x264_satd_8x4v_8x8h_neon
SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
transpose v0.8h, v1.8h, v16.8h, v17.8h
transpose v2.8h, v3.8h, v18.8h, v19.8h
transpose v4.8h, v5.8h, v20.8h, v21.8h
transpose v6.8h, v7.8h, v22.8h, v23.8h
SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h
SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h
transpose v0.4s, v2.4s, v16.4s, v18.4s
transpose v1.4s, v3.4s, v17.4s, v19.4s
transpose v4.4s, v6.4s, v20.4s, v22.4s
transpose v5.4s, v7.4s, v21.4s, v23.4s
abs v0.8h, v0.8h
abs v1.8h, v1.8h
abs v2.8h, v2.8h
abs v3.8h, v3.8h
abs v4.8h, v4.8h
abs v5.8h, v5.8h
abs v6.8h, v6.8h
abs v7.8h, v7.8h
umax v0.8h, v0.8h, v2.8h
umax v1.8h, v1.8h, v3.8h
umax v2.8h, v4.8h, v6.8h
umax v3.8h, v5.8h, v7.8h
ret
endfunc
function x264_pixel_satd_16x8_neon, export=1
mov x4, x30
bl x264_satd_16x4_neon
add v30.8h, v0.8h, v1.8h
add v31.8h, v2.8h, v3.8h
bl x264_satd_16x4_neon
add v0.8h, v0.8h, v1.8h
add v1.8h, v2.8h, v3.8h
add v30.8h, v30.8h, v0.8h
add v31.8h, v31.8h, v1.8h
add v0.8h, v30.8h, v31.8h
uaddlv s0, v0.8h
mov w0, v0.s[0]
ret x4
endfunc
function x264_pixel_satd_16x16_neon, export=1
mov x4, x30
bl x264_satd_16x4_neon
add v30.8h, v0.8h, v1.8h
add v31.8h, v2.8h, v3.8h
bl x264_satd_16x4_neon
add v0.8h, v0.8h, v1.8h
add v1.8h, v2.8h, v3.8h
add v30.8h, v30.8h, v0.8h
add v31.8h, v31.8h, v1.8h
bl x264_satd_16x4_neon
add v0.8h, v0.8h, v1.8h
add v1.8h, v2.8h, v3.8h