Commit 556b0e79 authored by Janne Grunau, committed by Fiona Glaser

aarch64: motion compensation NEON asm

Ported from the ARM NEON asm.
parent 6cda4398
@@ -127,9 +127,10 @@ endif
 ifeq ($(ARCH),AARCH64)
 ifneq ($(AS),)
 ASMSRC += common/aarch64/dct-a.S \
+          common/aarch64/mc-a.S \
           common/aarch64/pixel-a.S \
           common/aarch64/quant-a.S
-SRCS +=
+SRCS += common/aarch64/mc-c.c
 OBJASM = $(ASMSRC:%.S=%.o)
 endif
 endif
common/aarch64/mc-a.S (new file)
/*****************************************************************************
* mc.S: aarch64 motion compensation
*****************************************************************************
* Copyright (C) 2009-2014 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
* Mans Rullgard <mans@mansr.com>
* Stefan Groenroos <stefan.gronroos@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
// note: prefetch stuff assumes 64-byte cacheline
// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
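// x2 ends up as 0 when parity == 1 and as the stride otherwise, so the block
// below prefetches 8 rows starting either at pix + 64 or 8*stride further down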
function x264_prefetch_ref_aarch64, export=1
cmp w2, #1
csel x2, xzr, x1, eq
add x0, x0, #64
add x0, x0, x2, lsl #3
lsl x2, x1, #1
add x3, x1, x1, lsl #1
add x4, x0, x1, lsl #2
prfm pldl1strm, [x0]
prfm pldl1strm, [x0, x1]
prfm pldl1strm, [x0, x2]
prfm pldl1strm, [x0, x3]
prfm pldl1strm, [x4]
prfm pldl1strm, [x4, x1]
prfm pldl1strm, [x4, x2]
prfm pldl1strm, [x4, x3]
ret
endfunc
// void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y,
// uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
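// mb_x & 3 scales the starting row, so the prefetch window shifts with the
// macroblock position: 4 luma rows from pix_y + 64 + 4*(mb_x&3)*stride_y and
// 2 chroma rows (4 in the 4:2:2 variant) from pix_uv + 64 + 2*(mb_x&3)*stride_uv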
.macro x264_prefetch_fenc sub
function x264_prefetch_fenc_\sub\()_aarch64, export=1
and w6, w5, #3
and w7, w5, #3
mul x6, x6, x1
mul x7, x7, x3
add x0, x0, #64
add x2, x2, #64
add x0, x0, x6, lsl #2
add x6, x0, x1, lsl #1
prfm pldl1strm, [x0]
prfm pldl1strm, [x0, x1]
prfm pldl1strm, [x6]
prfm pldl1strm, [x6, x1]
add x2, x2, x7, lsl #1
prfm pldl1strm, [x2]
prfm pldl1strm, [x2, x3]
.ifc \sub, 422
add x7, x2, x3, lsl #1
prfm pldl1strm, [x7]
prfm pldl1strm, [x7, x3]
.endif
ret
endfunc
.endm
x264_prefetch_fenc 420
x264_prefetch_fenc 422
// void pixel_avg( uint8_t *dst, intptr_t dst_stride,
// uint8_t *src1, intptr_t src1_stride,
// uint8_t *src2, intptr_t src2_stride, int weight );
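// weight == 32 is the plain rounded average; any other weight computes
//   dst = clip( (src1*weight + src2*(64-weight) + 32) >> 6 )
// via the add_add (0 <= weight <= 64), add_sub (weight > 64) and
// sub_add (weight < 0) bodies below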
.macro AVGH w h
function x264_pixel_avg_\w\()x\h\()_neon, export=1
mov w10, #64
cmp w6, #32
mov w9, #\h
b.eq pixel_avg_w\w\()_neon
subs w7, w10, w6
b.lt pixel_avg_weight_w\w\()_add_sub_neon // weight > 64
cmp w6, #0
b.ge pixel_avg_weight_w\w\()_add_add_neon
b pixel_avg_weight_w\w\()_sub_add_neon // weight < 0
endfunc
.endm
AVGH 4, 2
AVGH 4, 4
AVGH 4, 8
AVGH 4, 16
AVGH 8, 4
AVGH 8, 8
AVGH 8, 16
AVGH 16, 8
AVGH 16, 16
// 0 < weight < 64
.macro load_weights_add_add
mov w6, w6
.endm
.macro weight_add_add dst, s1, s2, h=
.ifc \h, 2
umull2 \dst, \s1, v30.16b
umlal2 \dst, \s2, v31.16b
.else
umull \dst, \s1, v30.8b
umlal \dst, \s2, v31.8b
.endif
.endm
// weight > 64
.macro load_weights_add_sub
neg w7, w7
.endm
.macro weight_add_sub dst, s1, s2, h=
.ifc \h, 2
umull2 \dst, \s1, v30.16b
umlsl2 \dst, \s2, v31.16b
.else
umull \dst, \s1, v30.8b
umlsl \dst, \s2, v31.8b
.endif
.endm
// weight < 0
.macro load_weights_sub_add
neg w6, w6
.endm
.macro weight_sub_add dst, s1, s2, h=
.ifc \h, 2
umull2 \dst, \s2, v31.16b
umlsl2 \dst, \s1, v30.16b
.else
umull \dst, \s2, v31.8b
umlsl \dst, \s1, v30.8b
.endif
.endm
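// load_weights_add_add is only a placeholder; the other two flip the sign of the
// negative weight so unsigned multiply-accumulate/subtract can be used. The
// optional h=2 argument selects the high-half (*2) instruction forms used for
// the upper 8 bytes of 16-byte wide rows.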
.macro AVG_WEIGHT ext
function pixel_avg_weight_w4_\ext\()_neon
load_weights_\ext
dup v30.8b, w6
dup v31.8b, w7
1: // height loop
subs w9, w9, #2
ld1 {v0.s}[0], [x2], x3
ld1 {v1.s}[0], [x4], x5
weight_\ext v4.8h, v0.8b, v1.8b
ld1 {v2.s}[0], [x2], x3
ld1 {v3.s}[0], [x4], x5
sqrshrun v0.8b, v4.8h, #6
weight_\ext v5.8h, v2.8b, v3.8b
st1 {v0.s}[0], [x0], x1
sqrshrun v1.8b, v5.8h, #6
st1 {v1.s}[0], [x0], x1
b.gt 1b
ret
endfunc
function pixel_avg_weight_w8_\ext\()_neon
load_weights_\ext
dup v30.8b, w6
dup v31.8b, w7
1: // height loop
subs w9, w9, #4
ld1 {v0.8b}, [x2], x3
ld1 {v1.8b}, [x4], x5
weight_\ext v16.8h, v0.8b, v1.8b
ld1 {v2.8b}, [x2], x3
ld1 {v3.8b}, [x4], x5
weight_\ext v17.8h, v2.8b, v3.8b
ld1 {v4.8b}, [x2], x3
ld1 {v5.8b}, [x4], x5
weight_\ext v18.8h, v4.8b, v5.8b
ld1 {v6.8b}, [x2], x3
ld1 {v7.8b}, [x4], x5
weight_\ext v19.8h, v6.8b, v7.8b
sqrshrun v0.8b, v16.8h, #6
sqrshrun v1.8b, v17.8h, #6
sqrshrun v2.8b, v18.8h, #6
sqrshrun v3.8b, v19.8h, #6
st1 {v0.8b}, [x0], x1
st1 {v1.8b}, [x0], x1
st1 {v2.8b}, [x0], x1
st1 {v3.8b}, [x0], x1
b.gt 1b
ret
endfunc
function pixel_avg_weight_w16_\ext\()_neon
load_weights_\ext
dup v30.16b, w6
dup v31.16b, w7
1: // height loop
subs w9, w9, #2
ld1 {v0.16b}, [x2], x3
ld1 {v1.16b}, [x4], x5
weight_\ext v16.8h, v0.8b, v1.8b
weight_\ext v17.8h, v0.16b, v1.16b, 2
ld1 {v2.16b}, [x2], x3
ld1 {v3.16b}, [x4], x5
weight_\ext v18.8h, v2.8b, v3.8b
weight_\ext v19.8h, v2.16b, v3.16b, 2
sqrshrun v0.8b, v16.8h, #6
sqrshrun v1.8b, v18.8h, #6
sqrshrun2 v0.16b, v17.8h, #6
sqrshrun2 v1.16b, v19.8h, #6
st1 {v0.16b}, [x0], x1
st1 {v1.16b}, [x0], x1
b.gt 1b
ret
endfunc
.endm
AVG_WEIGHT add_add
AVG_WEIGHT add_sub
AVG_WEIGHT sub_add
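// unweighted average (weight == 32): urhadd computes (a + b + 1) >> 1 per byte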
function pixel_avg_w4_neon
1: subs w9, w9, #2
ld1 {v0.s}[0], [x2], x3
ld1 {v2.s}[0], [x4], x5
urhadd v0.8b, v0.8b, v2.8b
ld1 {v1.s}[0], [x2], x3
ld1 {v3.s}[0], [x4], x5
urhadd v1.8b, v1.8b, v3.8b
st1 {v0.s}[0], [x0], x1
st1 {v1.s}[0], [x0], x1
b.gt 1b
ret
endfunc
function pixel_avg_w8_neon
1: subs w9, w9, #4
ld1 {v0.8b}, [x2], x3
ld1 {v1.8b}, [x4], x5
ld1 {v2.8b}, [x2], x3
urhadd v0.8b, v0.8b, v1.8b
ld1 {v3.8b}, [x4], x5
st1 {v0.8b}, [x0], x1
ld1 {v4.8b}, [x2], x3
urhadd v1.8b, v2.8b, v3.8b
ld1 {v5.8b}, [x4], x5
st1 {v1.8b}, [x0], x1
ld1 {v6.8b}, [x2], x3
ld1 {v7.8b}, [x4], x5
urhadd v2.8b, v4.8b, v5.8b
urhadd v3.8b, v6.8b, v7.8b
st1 {v2.8b}, [x0], x1
st1 {v3.8b}, [x0], x1
b.gt 1b
ret
endfunc
function pixel_avg_w16_neon
1: subs w9, w9, #4
ld1 {v0.16b}, [x2], x3
ld1 {v1.16b}, [x4], x5
ld1 {v2.16b}, [x2], x3
urhadd v0.16b, v0.16b, v1.16b
ld1 {v3.16b}, [x4], x5
st1 {v0.16b}, [x0], x1
ld1 {v4.16b}, [x2], x3
urhadd v1.16b, v2.16b, v3.16b
ld1 {v5.16b}, [x4], x5
st1 {v1.16b}, [x0], x1
ld1 {v6.16b}, [x2], x3
ld1 {v7.16b}, [x4], x5
urhadd v2.16b, v4.16b, v5.16b
urhadd v3.16b, v6.16b, v7.16b
st1 {v2.16b}, [x0], x1
st1 {v3.16b}, [x0], x1
b.gt 1b
ret
endfunc
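// pixel_avg2: rounded average of two reference blocks; judging by the register
// use below the arguments are ( uint8_t *dst, intptr_t dst_stride, uint8_t *src1,
// intptr_t src_stride, uint8_t *src2, int height ), both sources stepped by x3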
function x264_pixel_avg2_w4_neon, export=1
1:
subs w5, w5, #2
ld1 {v0.s}[0], [x2], x3
ld1 {v2.s}[0], [x4], x3
urhadd v0.8b, v0.8b, v2.8b
ld1 {v1.s}[0], [x2], x3
ld1 {v3.s}[0], [x4], x3
urhadd v1.8b, v1.8b, v3.8b
st1 {v0.s}[0], [x0], x1
st1 {v1.s}[0], [x0], x1
b.gt 1b
ret
endfunc
function x264_pixel_avg2_w8_neon, export=1
1:
subs w5, w5, #2
ld1 {v0.8b}, [x2], x3
ld1 {v2.8b}, [x4], x3
urhadd v0.8b, v0.8b, v2.8b
ld1 {v1.8b}, [x2], x3
ld1 {v3.8b}, [x4], x3
urhadd v1.8b, v1.8b, v3.8b
st1 {v0.8b}, [x0], x1
st1 {v1.8b}, [x0], x1
b.gt 1b
ret
endfunc
function x264_pixel_avg2_w16_neon, export=1
1:
subs w5, w5, #2
ld1 {v0.16b}, [x2], x3
ld1 {v2.16b}, [x4], x3
urhadd v0.16b, v0.16b, v2.16b
ld1 {v1.16b}, [x2], x3
ld1 {v3.16b}, [x4], x3
urhadd v1.16b, v1.16b, v3.16b
st1 {v0.16b}, [x0], x1
st1 {v1.16b}, [x0], x1
b.gt 1b
ret
endfunc
function x264_pixel_avg2_w20_neon, export=1
sub x1, x1, #16
1:
subs w5, w5, #2
ld1 {v0.16b,v1.16b}, [x2], x3
ld1 {v2.16b,v3.16b}, [x4], x3
urhadd v0.16b, v0.16b, v2.16b
urhadd v1.8b, v1.8b, v3.8b
ld1 {v4.16b,v5.16b}, [x2], x3
ld1 {v6.16b,v7.16b}, [x4], x3
urhadd v4.16b, v4.16b, v6.16b
urhadd v5.8b, v5.8b, v7.8b
st1 {v0.16b}, [x0], #16
st1 {v1.s}[0], [x0], x1
st1 {v4.16b}, [x0], #16
st1 {v5.s}[0], [x0], x1
b.gt 1b
ret
endfunc
.macro weight_prologue type
mov w9, w5 // height
.ifc \type, full
ldr w12, [x4, #32] // denom
.endif
ldp w4, w5, [x4, #32+4] // scale, offset
dup v0.16b, w4
dup v1.8h, w5
.ifc \type, full
neg w12, w12
dup v2.8h, w12
.endif
.endm
// void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst,
// intptr_t dst_stride, const x264_weight_t *weight, int h )
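// i.e. dst = clip( ((src * scale + (1 << (denom-1))) >> denom) + offset );
// srshl by -denom is the rounding shift, sqxtun does the final clip to 8 bit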
function x264_mc_weight_w20_neon, export=1
weight_prologue full
sub x1, x1, #16
1:
subs w9, w9, #2
ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3
ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3
umull v22.8h, v16.8b, v0.8b
umull v23.8h, v17.8b, v0.8b
zip1 v18.2s, v18.2s, v21.2s
umull v25.8h, v19.8b, v0.8b
umull v26.8h, v20.8b, v0.8b
umull v24.8h, v18.8b, v0.8b
srshl v22.8h, v22.8h, v2.8h
srshl v23.8h, v23.8h, v2.8h
srshl v24.8h, v24.8h, v2.8h
srshl v25.8h, v25.8h, v2.8h
srshl v26.8h, v26.8h, v2.8h
add v22.8h, v22.8h, v1.8h
add v23.8h, v23.8h, v1.8h
add v24.8h, v24.8h, v1.8h
add v25.8h, v25.8h, v1.8h
add v26.8h, v26.8h, v1.8h
sqxtun v4.8b, v22.8h
sqxtun2 v4.16b, v23.8h
sqxtun v6.8b, v24.8h
sqxtun v5.8b, v25.8h
sqxtun2 v5.16b, v26.8h
st1 {v4.16b}, [x0], #16
st1 {v6.s}[0], [x0], x1
st1 {v5.16b}, [x0], #16
st1 {v6.s}[1], [x0], x1
b.gt 1b
ret
endfunc
function x264_mc_weight_w16_neon, export=1
weight_prologue full
weight16_loop:
1:
subs w9, w9, #2
ld1 {v4.16b}, [x2], x3
ld1 {v5.16b}, [x2], x3
umull v22.8h, v4.8b, v0.8b
umull2 v23.8h, v4.16b, v0.16b
umull v24.8h, v5.8b, v0.8b
umull2 v25.8h, v5.16b, v0.16b
srshl v22.8h, v22.8h, v2.8h
srshl v23.8h, v23.8h, v2.8h
srshl v24.8h, v24.8h, v2.8h
srshl v25.8h, v25.8h, v2.8h
add v22.8h, v22.8h, v1.8h
add v23.8h, v23.8h, v1.8h
add v24.8h, v24.8h, v1.8h
add v25.8h, v25.8h, v1.8h
sqxtun v4.8b, v22.8h
sqxtun2 v4.16b, v23.8h
sqxtun v5.8b, v24.8h
sqxtun2 v5.16b, v25.8h
st1 {v4.16b}, [x0], x1
st1 {v5.16b}, [x0], x1
b.gt 1b
ret
endfunc
function x264_mc_weight_w8_neon, export=1
weight_prologue full
1:
subs w9, w9, #2
ld1 {v16.8b}, [x2], x3
ld1 {v17.8b}, [x2], x3
umull v4.8h, v16.8b, v0.8b
umull v5.8h, v17.8b, v0.8b
srshl v4.8h, v4.8h, v2.8h
srshl v5.8h, v5.8h, v2.8h
add v4.8h, v4.8h, v1.8h
add v5.8h, v5.8h, v1.8h
sqxtun v16.8b, v4.8h
sqxtun v17.8b, v5.8h
st1 {v16.8b}, [x0], x1
st1 {v17.8b}, [x0], x1
b.gt 1b
ret
endfunc
function x264_mc_weight_w4_neon, export=1
weight_prologue full
1:
subs w9, w9, #2
ld1 {v16.s}[0], [x2], x3
ld1 {v16.s}[1], [x2], x3
umull v4.8h, v16.8b, v0.8b
srshl v4.8h, v4.8h, v2.8h
add v4.8h, v4.8h, v1.8h
sqxtun v16.8b, v4.8h
st1 {v16.s}[0], [x0], x1
st1 {v16.s}[1], [x0], x1
b.gt 1b
ret
endfunc
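// the _nodenom variants assume denom == 0, so the shift drops out:
//   dst = clip( src*scale + offset )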
function x264_mc_weight_w20_nodenom_neon, export=1
weight_prologue nodenom
sub x1, x1, #16
1:
subs w9, w9, #2
ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3
mov v27.16b, v1.16b
mov v28.16b, v1.16b
ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3
mov v31.16b, v1.16b
mov v29.16b, v1.16b
mov v30.16b, v1.16b
zip1 v18.2s, v18.2s, v21.2s
umlal v27.8h, v16.8b, v0.8b
umlal v28.8h, v17.8b, v0.8b
umlal v31.8h, v18.8b, v0.8b
umlal v29.8h, v19.8b, v0.8b
umlal v30.8h, v20.8b, v0.8b
sqxtun v4.8b, v27.8h
sqxtun2 v4.16b, v28.8h
sqxtun v5.8b, v29.8h
sqxtun2 v5.16b, v30.8h
sqxtun v6.8b, v31.8h
st1 {v4.16b}, [x0], #16
st1 {v6.s}[0], [x0], x1
st1 {v5.16b}, [x0], #16
st1 {v6.s}[1], [x0], x1
b.gt 1b
ret
endfunc
function x264_mc_weight_w16_nodenom_neon, export=1
weight_prologue nodenom
1:
subs w9, w9, #2
ld1 {v6.16b}, [x2], x3
mov v27.16b, v1.16b
mov v28.16b, v1.16b
ld1 {v7.16b}, [x2], x3
mov v29.16b, v1.16b
mov v30.16b, v1.16b
umlal v27.8h, v6.8b, v0.8b
umlal2 v28.8h, v6.16b, v0.16b
umlal v29.8h, v7.8b, v0.8b
umlal2 v30.8h, v7.16b, v0.16b
sqxtun v4.8b, v27.8h
sqxtun2 v4.16b, v28.8h
sqxtun v5.8b, v29.8h
sqxtun2 v5.16b, v30.8h
st1 {v4.16b}, [x0], x1
st1 {v5.16b}, [x0], x1
b.gt 1b
ret
endfunc
function x264_mc_weight_w8_nodenom_neon, export=1
weight_prologue nodenom
1:
subs w9, w9, #2
ld1 {v16.8b}, [x2], x3
mov v27.16b, v1.16b
ld1 {v17.8b}, [x2], x3
mov v29.16b, v1.16b
umlal v27.8h, v16.8b, v0.8b
umlal v29.8h, v17.8b, v0.8b
sqxtun v4.8b, v27.8h
sqxtun v5.8b, v29.8h
st1 {v4.8b}, [x0], x1
st1 {v5.8b}, [x0], x1
b.gt 1b
ret
endfunc
function x264_mc_weight_w4_nodenom_neon, export=1
weight_prologue nodenom
1:
subs w9, w9, #2
ld1 {v16.s}[0], [x2], x3
ld1 {v16.s}[1], [x2], x3
mov v27.16b, v1.16b
umlal v27.8h, v16.8b, v0.8b
sqxtun v4.8b, v27.8h
st1 {v4.s}[0], [x0], x1
st1 {v4.s}[1], [x0], x1
b.gt 1b
ret
endfunc
.macro weight_simple_prologue
ldr w6, [x4] // offset
dup v1.16b, w6
.endm
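// offsetadd/offsetsub apply only a saturating +/- offset (uqadd/uqsub below)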
.macro weight_simple name op
function x264_mc_weight_w20_\name\()_neon, export=1
weight_simple_prologue
1:
subs w5, w5, #2
ldr s18, [x2, #16]
ld1 {v16.16b}, [x2], x3
ldr s19, [x2, #16]
ld1 {v17.16b}, [x2], x3
\op v18.8b, v18.8b, v1.8b
\op v16.16b, v16.16b, v1.16b
\op v19.8b, v19.8b, v1.8b
\op v17.16b, v17.16b, v1.16b
str s18, [x0, #16]
st1 {v16.16b}, [x0], x1
str s19, [x0, #16]
st1 {v17.16b}, [x0], x1
b.gt 1b
ret
endfunc
function x264_mc_weight_w16_\name\()_neon, export=1
weight_simple_prologue
1:
subs w5, w5, #2
ld1 {v16.16b}, [x2], x3
ld1 {v17.16b}, [x2], x3
\op v16.16b, v16.16b, v1.16b
\op v17.16b, v17.16b, v1.16b
st1 {v16.16b}, [x0], x1
st1 {v17.16b}, [x0], x1
b.gt 1b
ret
endfunc
function x264_mc_weight_w8_\name\()_neon, export=1
weight_simple_prologue
1:
subs w5, w5, #2
ld1 {v16.8b}, [x2], x3
ld1 {v17.8b}, [x2], x3
\op v16.8b, v16.8b, v1.8b
\op v17.8b, v17.8b, v1.8b
st1 {v16.8b}, [x0], x1
st1 {v17.8b}, [x0], x1
b.gt 1b
ret
endfunc
function x264_mc_weight_w4_\name\()_neon, export=1
weight_simple_prologue
1:
subs w5, w5, #2
ld1 {v16.s}[0], [x2], x3
ld1 {v16.s}[1], [x2], x3
\op v16.8b, v16.8b, v1.8b
st1 {v16.s}[0], [x0], x1
st1 {v16.s}[1], [x0], x1
b.gt 1b
ret
endfunc
.endm
weight_simple offsetadd, uqadd
weight_simple offsetsub, uqsub
// void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height )
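// straight copies; each iteration moves four rows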
function x264_mc_copy_w4_neon, export=1
1:
subs w4, w4, #4
ld1 {v0.s}[0], [x2], x3
ld1 {v1.s}[0], [x2], x3
ld1 {v2.s}[0], [x2], x3
ld1 {v3.s}[0], [x2], x3
st1 {v0.s}[0], [x0], x1
st1 {v1.s}[0], [x0], x1
st1 {v2.s}[0], [x0], x1
st1 {v3.s}[0], [x0], x1
b.gt 1b
ret
endfunc
function x264_mc_copy_w8_neon, export=1
1: subs w4, w4, #4
ld1 {v0.8b}, [x2], x3
ld1 {v1.8b}, [x2], x3
ld1 {v2.8b}, [x2], x3
ld1 {v3.8b}, [x2], x3
st1 {v0.8b}, [x0], x1
st1 {v1.8b}, [x0], x1
st1 {v2.8b}, [x0], x1
st1 {v3.8b}, [x0], x1
b.gt 1b
ret
endfunc
function x264_mc_copy_w16_neon, export=1
1: subs w4, w4, #4
ld1 {v0.16b}, [x2], x3
ld1 {v1.16b}, [x2], x3
ld1 {v2.16b}, [x2], x3
ld1 {v3.16b}, [x2], x3
st1 {v0.16b}, [x0], x1
st1 {v1.16b}, [x0], x1
st1 {v2.16b}, [x0], x1
st1 {v3.16b}, [x0], x1
b.gt 1b
ret
endfunc
// void x264_mc_chroma_neon( uint8_t *dst_u, uint8_t *dst_v,
// intptr_t i_dst_stride,
// uint8_t *src, intptr_t i_src_stride,
// int dx, int dy, int i_width, int i_height );
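// dx/dy are in 1/8-pel units: the integer part (sbfx = asr #3 with sign extend)
// moves the source pointer (two bytes per horizontal step, U/V being interleaved),
// the fractional part (& 7) selects the bilinear weights; widths 8 and 4 branch
// to their own loops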
function x264_mc_chroma_neon, export=1
ldr w15, [sp] // height
sbfx x12, x6, #3, #29 // asr(3) and sign extend
sbfx x11, x5, #3, #29 // asr(3) and sign extend
cmp w7, #4
mul x12, x12, x4
add x3, x3, x11, lsl #1
and w5, w5, #7
and w6, w6, #7
add x3, x3, x12
//pld [x3]
//pld [x3, x4]
b.gt mc_chroma_w8_neon
b.eq mc_chroma_w4_neon
endfunc
.macro CHROMA_MC_START r00, r01, r10, r11
mul w12, w5, w6 // cD = d8x *d8y
lsl w13, w5, #3
add w9, w12, #64
lsl w14, w6, #3
tst w12, w12
sub w9, w9, w13
sub w10, w13, w12 // cB = d8x *(8-d8y);
sub w11, w14, w12 // cC = (8-d8x)*d8y
sub w9, w9, w14 // cA = (8-d8x)*(8-d8y);
.endm
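// cA+cB+cC+cD == 64, giving the usual H.264 chroma interpolation
//   dst = ( cA*src[0] + cB*src[1] + cC*src[stride] + cD*src[stride+1] + 32 ) >> 6
// tst leaves the flags set so the b.eq after the macro can skip the full 2-D
// interpolation when cD == 0 (dx or dy is zero)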
.macro CHROMA_MC width, vsize
function mc_chroma_w\width\()_neon
// since the element size varies, there's a different index for the 2nd store
.if \width == 4
.set st2, 1
.else
.set st2, 2
.endif
CHROMA_MC_START
b.eq 2f
ld2 {v28.8b,v29.8b}, [x3], x4
dup v0.8b, w9 // cA
dup v1.8b, w10 // cB
ext v6.8b, v28.8b, v6.8b, #1
ext v7.8b, v29.8b, v7.8b, #1
ld2 {v30.8b,v31.8b}, [x3], x4
dup v2.8b, w11 // cC
dup v3.8b, w12 // cD
ext v22.8b, v30.8b, v22.8b, #1
ext v23.8b, v31.8b, v23.8b, #1
trn1 v0.2s, v0.2s, v1.2s
trn1 v2.2s, v2.2s, v3.2s
trn1 v4.2s, v28.2s, v6.2s
trn1 v5.2s, v29.2s, v7.2s
trn1 v20.2s, v30.2s, v22.2s
trn1 v21.2s, v31.2s, v23.2s
1: // height loop, interpolate xy
subs w15, w15, #2
umull v16.8h, v4.8b, v0.8b
umlal v16.8h, v20.8b, v2.8b
umull v17.8h, v5.8b, v0.8b
umlal v17.8h, v21.8b, v2.8b
ld2 {v28.8b,v29.8b}, [x3], x4
transpose v24.2d, v25.2d, v16.2d, v17.2d
ext v6.8b, v28.8b, v6.8b, #1
ext v7.8b, v29.8b, v7.8b, #1
trn1 v4.2s, v28.2s, v6.2s
trn1 v5.2s, v29.2s, v7.2s
add v16.8h, v24.8h, v25.8h
umull v18.8h, v20.8b, v0.8b
umlal v18.8h, v4.8b, v2.8b
umull v19.8h, v21.8b, v0.8b
umlal v19.8h, v5.8b, v2.8b
ld2 {v30.8b,v31.8b}, [x3], x4
transpose v26.2d, v27.2d, v18.2d, v19.2d
ext v22.8b, v30.8b, v22.8b, #1
ext v23.8b, v31.8b, v23.8b, #1
trn1 v20.2s, v30.2s, v22.2s
trn1 v21.2s, v31.2s, v23.2s
add v17.8h, v26.8h, v27.8h