Commit 6bf21c63 authored by David Conrad, committed by Fiona Glaser

GSOC merge part 4: ARM NEON mc assembly functions

prefetch, memcpy_aligned, memzero_aligned, avg, mc_luma, get_ref, mc_chroma, hpel_filter, frame_init_lowres
parent 52f9719b
@@ -58,7 +58,8 @@ endif
# NEON optims
ifeq ($(ARCH),ARM)
ifneq ($(AS),)
ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S
ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S
SRCS += common/arm/mc-c.c
OBJASM = $(ASMSRC:%.S=%.o)
endif
endif
common/arm/mc-a.S
/*****************************************************************************
* mc.S: h264 encoder
*****************************************************************************
* Copyright (C) 2009 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Mans Rullgard <mans@mansr.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
#include "asm.S"
.fpu neon
.text
// note: the prefetch functions assume a 64-byte cacheline, which is true for the Cortex-A8.
// They also use nothing above armv5te, but we don't care about pre-armv6
// void prefetch_ref( uint8_t *pix, int stride, int parity )
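// prefetches 8 rows of the reference, one cacheline (64 bytes) ahead;
// parity selects whether to start 8 rows further down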
function x264_prefetch_ref_arm, export=1
sub r2, r2, #1
add r0, r0, #64
and r2, r2, r1
add r0, r0, r2, lsl #3
add r2, r1, r1, lsl #1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
add r3, r0, r1, lsl #2
pld [r0, r2]
pld [r3]
pld [r3, r1]
pld [r3, r1, lsl #1]
pld [r3, r2]
bx lr
.endfunc
// void prefetch_fenc( uint8_t *pix_y, int stride_y,
// uint8_t *pix_uv, int stride_uv, int mb_x )
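// prefetches 4 rows each of the luma and chroma fenc planes, one cacheline ahead;
// mb_x offsets the starting row by (mb_x&3)*4 luma rows and (mb_x&6)*4 chroma rows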
function x264_prefetch_fenc_arm, export=1
ldr ip, [sp]
push {lr}
and lr, ip, #3
smulbb lr, lr, r1 // note: this assumes stride_y is <= 16 bits signed
and ip, ip, #6
smulbb ip, ip, r3
add r0, r0, #64
add r2, r2, #64
add r0, r0, lr, lsl #2
pld [r0]
add lr, r0, r1, lsl #1
pld [r0, r1]
pld [lr]
add r2, r2, ip, lsl #2
pld [lr, r1]
pld [r2]
add ip, r2, r3, lsl #1
pld [r2, r3]
pld [ip]
pld [ip, r3]
pop {pc}
.endfunc
// void *x264_memcpy_aligned( void * dst, const void * src, size_t n )
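// dispatches through memcpy_table on the 8- vs 16-byte alignment of dst and src:
// bit 3 of the table index is set if dst is only 8-byte aligned, bit 2 if src is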
function x264_memcpy_aligned_neon, export=1
orr r3, r0, r1, lsr #1
movrel ip, memcpy_table
and r3, r3, #0xc
ldr pc, [ip, r3]
.endfunc
.macro MEMCPY_ALIGNED srcalign dstalign
function memcpy_aligned_\dstalign\()_\srcalign\()_neon
mov r3, r0
.if \srcalign == 8 && \dstalign == 8
sub r2, #16
vld1.64 {d0}, [r1,:64]!
vst1.64 {d0}, [r3,:64]!
.set r1align, 128
.set r3align, 128
.else
.set r1align, \srcalign * 8
.set r3align, \dstalign * 8
.endif
tst r2, #16
beq 32f
sub r2, #16
vld1.64 {d0-d1}, [r1,:r1align]!
vst1.64 {d0-d1}, [r3,:r3align]!
32: // n is a multiple of 32
tst r2, #32
beq 64f
sub r2, #32
vld1.64 {d0-d3}, [r1,:r1align]!
vst1.64 {d0-d3}, [r3,:r3align]!
64: // n is a multiple of 64
subs r2, #64
vld1.64 {d0-d3}, [r1,:r1align]!
vld1.64 {d4-d7}, [r1,:r1align]!
vst1.64 {d0-d3}, [r3,:r3align]!
vst1.64 {d4-d7}, [r3,:r3align]!
bgt 64b
.if \srcalign == 8 && \dstalign == 8
vld1.64 {d0}, [r1,:64]!
vst1.64 {d0}, [r3,:64]!
.endif
bx lr
.endfunc
.endm
MEMCPY_ALIGNED 16, 16
MEMCPY_ALIGNED 16, 8
MEMCPY_ALIGNED 8, 16
MEMCPY_ALIGNED 8, 8
.section .rodata
memcpy_table:
.word memcpy_aligned_16_16_neon
.word memcpy_aligned_16_8_neon
.word memcpy_aligned_8_16_neon
.word memcpy_aligned_8_8_neon
.text
// void x264_memzero_aligned( void *dst, size_t n )
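// n is assumed to be a multiple of 128; each iteration clears 128 bytes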
function x264_memzero_aligned_neon, export=1
vmov.i8 q0, #0
vmov.i8 q1, #0
memzero_loop:
subs r1, #128
.rept 4
vst1.64 {d0-d3}, [r0,:128]!
.endr
bgt memzero_loop
bx lr
.endfunc
// void pixel_avg( uint8_t *dst, int dst_stride,
// uint8_t *src1, int src1_stride,
// uint8_t *src2, int src2_stride, int weight );
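// dst = ( src1*weight + src2*(64-weight) + 32 ) >> 6
// weight == 32 is a plain rounding average; weights outside [0,64] make one of
// the two terms negative, hence the separate add/sub and sub/add variants below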
.macro AVGH w h
function x264_pixel_avg_\w\()x\h\()_neon, export=1
ldr ip, [sp, #8]
push {r4-r6,lr}
cmp ip, #32
ldrd r4, [sp, #16]
mov lr, #\h
beq x264_pixel_avg_w\w\()_neon
rsbs r6, ip, #64
blt x264_pixel_avg_weight_w\w\()_add_sub_neon // weight > 64
cmp ip, #0
bge x264_pixel_avg_weight_w\w\()_add_add_neon
b x264_pixel_avg_weight_w\w\()_sub_add_neon // weight < 0
.endfunc
.endm
AVGH 4, 2
AVGH 4, 4
AVGH 4, 8
AVGH 8, 4
AVGH 8, 8
AVGH 8, 16
AVGH 16, 8
AVGH 16, 16
// 0 < weight < 64
.macro load_weights_add_add
vdup.8 d30, ip
vdup.8 d31, r6
.endm
.macro load_add_add d1 d2
vld1.32 {\d1}, [r2], r3
vld1.32 {\d2}, [r4], r5
.endm
.macro weight_add_add dst s1 s2
vmull.u8 \dst, \s1, d30
vmlal.u8 \dst, \s2, d31
.endm
// weight > 64
.macro load_weights_add_sub
rsb r6, #0
vdup.8 d30, ip
vdup.8 d31, r6
.endm
.macro load_add_sub d1 d2
vld1.32 {\d1}, [r2], r3
vld1.32 {\d2}, [r4], r5
.endm
.macro weight_add_sub dst s1 s2
vmull.u8 \dst, \s1, d30
vmlsl.u8 \dst, \s2, d31
.endm
// weight < 0
.macro load_weights_sub_add
rsb ip, #0
vdup.8 d31, r6
vdup.8 d30, ip
.endm
.macro load_sub_add d1 d2
vld1.32 {\d2}, [r4], r5
vld1.32 {\d1}, [r2], r3
.endm
.macro weight_sub_add dst s1 s2
vmull.u8 \dst, \s2, d31
vmlsl.u8 \dst, \s1, d30
.endm
.macro AVG_WEIGHT ext
function x264_pixel_avg_weight_w4_\ext\()_neon, export=1
load_weights_\ext
1: // height loop
subs lr, lr, #2
load_\ext d0[], d1[]
weight_\ext q8, d0, d1
load_\ext d2[], d3[]
vqrshrun.s16 d0, q8, #6
weight_\ext q9, d2, d3
vst1.32 {d0[0]}, [r0,:32], r1
vqrshrun.s16 d1, q9, #6
vst1.32 {d1[0]}, [r0,:32], r1
bgt 1b
pop {r4-r6,pc}
.endfunc
function x264_pixel_avg_weight_w8_\ext\()_neon, export=1
load_weights_\ext
1: // height loop
subs lr, lr, #4
load_\ext d0, d1
weight_\ext q8, d0, d1
load_\ext d2, d3
weight_\ext q9, d2, d3
load_\ext d4, d5
weight_\ext q10, d4, d5
load_\ext d6, d7
weight_\ext q11, d6, d7
vqrshrun.s16 d0, q8, #6
vqrshrun.s16 d1, q9, #6
vqrshrun.s16 d2, q10, #6
vqrshrun.s16 d3, q11, #6
vst1.64 {d0}, [r0,:64], r1
vst1.64 {d1}, [r0,:64], r1
vst1.64 {d2}, [r0,:64], r1
vst1.64 {d3}, [r0,:64], r1
bgt 1b
pop {r4-r6,pc}
.endfunc
function x264_pixel_avg_weight_w16_\ext\()_neon, export=1
load_weights_\ext
1: // height loop
subs lr, lr, #2
load_\ext d0-d1, d2-d3
weight_\ext q8, d0, d2
weight_\ext q9, d1, d3
load_\ext d4-d5, d6-d7
weight_\ext q10, d4, d6
weight_\ext q11, d5, d7
vqrshrun.s16 d0, q8, #6
vqrshrun.s16 d1, q9, #6
vqrshrun.s16 d2, q10, #6
vqrshrun.s16 d3, q11, #6
vst1.64 {d0-d1}, [r0,:128], r1
vst1.64 {d2-d3}, [r0,:128], r1
bgt 1b
pop {r4-r6,pc}
.endfunc
.endm
AVG_WEIGHT add_add
AVG_WEIGHT add_sub
AVG_WEIGHT sub_add
function x264_pixel_avg_w4_neon, export=1
subs lr, lr, #2
vld1.32 {d0[]}, [r2], r3
vld1.32 {d2[]}, [r4], r5
vrhadd.u8 d0, d0, d2
vld1.32 {d1[]}, [r2], r3
vld1.32 {d3[]}, [r4], r5
vrhadd.u8 d1, d1, d3
vst1.32 {d0[0]}, [r0,:32], r1
vst1.32 {d1[0]}, [r0,:32], r1
bgt x264_pixel_avg_w4_neon
pop {r4-r6,pc}
.endfunc
function x264_pixel_avg_w8_neon, export=1
subs lr, lr, #4
vld1.64 {d0}, [r2], r3
vld1.64 {d2}, [r4], r5
vrhadd.u8 d0, d0, d2
vld1.64 {d1}, [r2], r3
vld1.64 {d3}, [r4], r5
vrhadd.u8 d1, d1, d3
vst1.64 {d0}, [r0,:64], r1
vld1.64 {d2}, [r2], r3
vld1.64 {d4}, [r4], r5
vrhadd.u8 d2, d2, d4
vst1.64 {d1}, [r0,:64], r1
vld1.64 {d3}, [r2], r3
vld1.64 {d5}, [r4], r5
vrhadd.u8 d3, d3, d5
vst1.64 {d2}, [r0,:64], r1
vst1.64 {d3}, [r0,:64], r1
bgt x264_pixel_avg_w8_neon
pop {r4-r6,pc}
.endfunc
function x264_pixel_avg_w16_neon, export=1
subs lr, lr, #4
vld1.64 {d0-d1}, [r2], r3
vld1.64 {d2-d3}, [r4], r5
vrhadd.u8 q0, q0, q1
vld1.64 {d2-d3}, [r2], r3
vld1.64 {d4-d5}, [r4], r5
vrhadd.u8 q1, q1, q2
vst1.64 {d0-d1}, [r0,:128], r1
vld1.64 {d4-d5}, [r2], r3
vld1.64 {d6-d7}, [r4], r5
vrhadd.u8 q2, q2, q3
vst1.64 {d2-d3}, [r0,:128], r1
vld1.64 {d6-d7}, [r2], r3
vld1.64 {d0-d1}, [r4], r5
vrhadd.u8 q3, q3, q0
vst1.64 {d4-d5}, [r0,:128], r1
vst1.64 {d6-d7}, [r0,:128], r1
bgt x264_pixel_avg_w16_neon
pop {r4-r6,pc}
.endfunc
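// void pixel_avg2_wX( uint8_t *dst, int dst_stride,
//                     uint8_t *src1, int src_stride, uint8_t *src2, int height )
// unweighted average of two sources that share one stride (halfpel interpolation)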
function x264_pixel_avg2_w4_neon, export=1
ldr ip, [sp, #4]
push {lr}
ldr lr, [sp, #4]
avg2_w4_loop:
subs ip, ip, #2
vld1.32 {d0[]}, [r2], r3
vld1.32 {d2[]}, [lr], r3
vrhadd.u8 d0, d0, d2
vld1.32 {d1[]}, [r2], r3
vld1.32 {d3[]}, [lr], r3
vrhadd.u8 d1, d1, d3
vst1.32 {d0[0]}, [r0,:32], r1
vst1.32 {d1[0]}, [r0,:32], r1
bgt avg2_w4_loop
pop {pc}
.endfunc
function x264_pixel_avg2_w8_neon, export=1
ldr ip, [sp, #4]
push {lr}
ldr lr, [sp, #4]
avg2_w8_loop:
subs ip, ip, #2
vld1.64 {d0}, [r2], r3
vld1.64 {d2}, [lr], r3
vrhadd.u8 d0, d0, d2
vld1.64 {d1}, [r2], r3
vld1.64 {d3}, [lr], r3
vrhadd.u8 d1, d1, d3
vst1.64 {d0}, [r0,:64], r1
vst1.64 {d1}, [r0,:64], r1
bgt avg2_w8_loop
pop {pc}
.endfunc
function x264_pixel_avg2_w16_neon, export=1
ldr ip, [sp, #4]
push {lr}
ldr lr, [sp, #4]
avg2_w16_loop:
subs ip, ip, #2
vld1.64 {d0-d1}, [r2], r3
vld1.64 {d2-d3}, [lr], r3
vrhadd.u8 q0, q0, q1
vld1.64 {d4-d5}, [r2], r3
vld1.64 {d6-d7}, [lr], r3
vrhadd.u8 q2, q2, q3
vst1.64 {d0-d1}, [r0,:128], r1
vst1.64 {d4-d5}, [r0,:128], r1
bgt avg2_w16_loop
pop {pc}
.endfunc
function x264_pixel_avg2_w20_neon, export=1
ldr ip, [sp, #4]
push {lr}
sub r1, r1, #16
ldr lr, [sp, #4]
avg2_w20_loop:
subs ip, ip, #2
vld1.64 {d0-d2}, [r2], r3
vld1.64 {d4-d6}, [lr], r3
vrhadd.u8 q0, q0, q2
vrhadd.u8 d2, d2, d6
vld1.64 {d4-d6}, [r2], r3
vld1.64 {d16-d18},[lr], r3
vrhadd.u8 q2, q2, q8
vst1.64 {d0-d1}, [r0,:128]!
vrhadd.u8 d6, d6, d18
vst1.32 {d2[0]}, [r0,:32], r1
vst1.64 {d4-d5}, [r0,:128]!
vst1.32 {d6[0]}, [r0,:32], r1
bgt avg2_w20_loop
pop {pc}
.endfunc
// void mc_copy( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int height )
function x264_mc_copy_w4_neon, export=1
ldr ip, [sp]
copy_w4_loop:
subs ip, ip, #4
vld1.32 {d0[]}, [r2], r3
vld1.32 {d1[]}, [r2], r3
vld1.32 {d2[]}, [r2], r3
vld1.32 {d3[]}, [r2], r3
vst1.32 {d0[0]}, [r0,:32], r1
vst1.32 {d1[0]}, [r0,:32], r1
vst1.32 {d2[0]}, [r0,:32], r1
vst1.32 {d3[0]}, [r0,:32], r1
bgt copy_w4_loop
bx lr
.endfunc
function x264_mc_copy_w8_neon, export=1
ldr ip, [sp]
copy_w8_loop:
subs ip, ip, #4
vld1.32 {d0}, [r2], r3
vld1.32 {d1}, [r2], r3
vld1.32 {d2}, [r2], r3
vld1.32 {d3}, [r2], r3
vst1.32 {d0}, [r0,:64], r1
vst1.32 {d1}, [r0,:64], r1
vst1.32 {d2}, [r0,:64], r1
vst1.32 {d3}, [r0,:64], r1
bgt copy_w8_loop
bx lr
.endfunc
function x264_mc_copy_w16_neon, export=1
ldr ip, [sp]
copy_w16_loop:
subs ip, ip, #4
vld1.32 {d0-d1}, [r2], r3
vld1.32 {d2-d3}, [r2], r3
vld1.32 {d4-d5}, [r2], r3
vld1.32 {d6-d7}, [r2], r3
vst1.32 {d0-d1}, [r0,:128], r1
vst1.32 {d2-d3}, [r0,:128], r1
vst1.32 {d4-d5}, [r0,:128], r1
vst1.32 {d6-d7}, [r0,:128], r1
bgt copy_w16_loop
bx lr
.endfunc
function x264_mc_copy_w16_aligned_neon, export=1
ldr ip, [sp]
copy_w16_aligned_loop:
subs ip, ip, #4
vld1.32 {d0-d1}, [r2,:128], r3
vld1.32 {d2-d3}, [r2,:128], r3
vld1.32 {d4-d5}, [r2,:128], r3
vld1.32 {d6-d7}, [r2,:128], r3
vst1.32 {d0-d1}, [r0,:128], r1
vst1.32 {d2-d3}, [r0,:128], r1
vst1.32 {d4-d5}, [r0,:128], r1
vst1.32 {d6-d7}, [r0,:128], r1
bgt copy_w16_aligned_loop
bx lr
.endfunc
// void x264_mc_chroma_neon( uint8_t *dst, int i_dst_stride,
// uint8_t *src, int i_src_stride,
// int dx, int dy, int i_width, int i_height );
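// dx/dy are the chroma MV in 1/8-pel units: the integer part is folded into the
// src pointer below, the fractional part selects the bilinear filter coefficients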
function x264_mc_chroma_neon, export=1
push {r4-r6, lr}
ldrd r4, [sp, #16]
ldr r6, [sp, #24]
asr lr, r5, #3
mul lr, r3, lr
add r2, r2, r4, asr #3
cmp r6, #4
add r2, r2, lr
and r4, r4, #7
and r5, r5, #7
pld [r2]
pld [r2, r3]
bgt mc_chroma_w8
beq mc_chroma_w4
// calculate cA cB cC cD
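// cA = (8-dx)*(8-dy)   cB = dx*(8-dy)   cC = (8-dx)*dy   cD = dx*dy
// dst = ( cA*src[0] + cB*src[1] + cC*src[stride] + cD*src[stride+1] + 32 ) >> 6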
.macro CHROMA_MC_START r0 r1
muls lr, r4, r5
rsb r6, lr, r5, lsl #3
rsb ip, lr, r4, lsl #3
sub r4, lr, r4, lsl #3
sub r4, r4, r5, lsl #3
add r4, r4, #64
beq 2f
add r5, r2, r3
vdup.8 d0, r4
lsl r3, r3, #1
vdup.8 d1, ip
vld1.64 {\r0}, [r2], r3
vdup.8 d2, r6
vld1.64 {\r1}, [r5], r3
vdup.8 d3, lr
ldr r4, [sp, #28]
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
.endm
.macro CHROMA_MC width, align
mc_chroma_w\width:
CHROMA_MC_START d4, d6
// since the element size varies, there's a different index for the 2nd store
.if \width == 4
.set st2, 1
.else
.set st2, 2
.endif
vtrn.32 d4, d5
vtrn.32 d6, d7
vtrn.32 d0, d1
vtrn.32 d2, d3
1: // height loop, interpolate xy
pld [r5]
vmull.u8 q8, d4, d0
vmlal.u8 q8, d6, d2
vld1.64 {d4}, [r2], r3
vext.8 d5, d4, d5, #1
vtrn.32 d4, d5
vmull.u8 q9, d6, d0
vmlal.u8 q9, d4, d2
vld1.64 {d6}, [r5], r3
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
vrshrn.u16 d16, q8, #6
subs r4, r4, #2
pld [r2]
vext.8 d7, d6, d7, #1
vtrn.32 d6, d7
vst1.\align {d16[0]}, [r0,:\align], r1
vst1.\align {d16[st2]}, [r0,:\align], r1
bgt 1b
pop {r4-r6, pc}
2: // dx or dy is 0
tst r6, r6
add ip, ip, r6
vdup.8 d0, r4
vdup.8 d1, ip
vtrn.32 d0, d1
ldr r4, [sp, #28]
beq 4f
vext.32 d1, d0, d1, #1
add r5, r2, r3
lsl r3, r3, #1
vld1.32 {d4[0]}, [r2], r3
vld1.32 {d4[1]}, [r5], r3
3: // vertical interpolation loop
pld [r5]
vmull.u8 q8, d4, d0
vld1.32 {d4[0]}, [r2], r3
vmull.u8 q9, d4, d1
vld1.32 {d4[1]}, [r5], r3
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
vrshrn.u16 d16, q8, #6
subs r4, r4, #2
pld [r2]
vst1.\align {d16[0]}, [r0,:\align], r1
vst1.\align {d16[st2]}, [r0,:\align], r1
bgt 3b
pop {r4-r6, pc}
4: // dy is 0
vld1.64 {d4}, [r2], r3
vld1.64 {d6}, [r2], r3
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vtrn.32 d4, d5
vtrn.32 d6, d7
5: // horizontal interpolation loop
vmull.u8 q8, d4, d0
vmull.u8 q9, d6, d0
subs r4, r4, #2
vld1.64 {d4}, [r2], r3
vext.8 d5, d4, d5, #1
vtrn.32 d4, d5
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
pld [r2]
vrshrn.u16 d16, q8, #6
vld1.64 {d6}, [r2], r3
vext.8 d7, d6, d7, #1
vtrn.32 d6, d7
pld [r2]
vst1.\align {d16[0]}, [r0,:\align], r1
vst1.\align {d16[st2]}, [r0,:\align], r1
bgt 5b
pop {r4-r6, pc}
.endm
CHROMA_MC 2, 16
CHROMA_MC 4, 32
// the optimal timing for width 8 is different enough that it's not
// readable to put it in the same macro as width 2/4
mc_chroma_w8:
CHROMA_MC_START d4-d5, d6-d7
1: // height loop, interpolate xy
pld [r5]
vmull.u8 q8, d4, d0
vmlal.u8 q8, d5, d1
vld1.64 {d4, d5}, [r2], r3
vmlal.u8 q8, d6, d2
vext.8 d5, d4, d5, #1
vmlal.u8 q8, d7, d3
vmull.u8 q9, d6, d0
subs r4, r4, #2
vmlal.u8 q9, d7, d1
vmlal.u8 q9, d4, d2
vmlal.u8 q9, d5, d3
vrshrn.u16 d16, q8, #6
vld1.64 {d6, d7}, [r5], r3
pld [r2]
vrshrn.u16 d17, q9, #6
vext.8 d7, d6, d7, #1
vst1.64 {d16}, [r0,:64], r1
vst1.64 {d17}, [r0,:64], r1
bgt 1b
pop {r4-r6, pc}
2: // dx or dy is 0
tst r6, r6
add ip, ip, r6
vdup.8 d0, r4
vdup.8 d1, ip
ldr r4, [sp, #28]
beq 4f
add r5, r2, r3
lsl r3, r3, #1
vld1.64 {d4}, [r2], r3
vld1.64 {d6}, [r5], r3
3: // vertical interpolation loop
pld [r5]
vmull.u8 q8, d4, d0
vmlal.u8 q8, d6, d1
vld1.64 {d4}, [r2], r3
vmull.u8 q9, d6, d0
vmlal.u8 q9, d4, d1
vld1.64 {d6}, [r5], r3
vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6
subs r4, r4, #2
pld [r2]
vst1.64 {d16}, [r0,:64], r1
vst1.64 {d17}, [r0,:64], r1