Commit 52f9719b authored by David Conrad, committed by Fiona Glaser

GSOC merge part 3: ARM NEON pixel assembly functions

SAD, SADX3/X4, SSD, SATD, SA8D, Hadamard_AC, VAR, VAR2, SSIM
parent ca7da1ae
@@ -58,7 +58,7 @@ endif
 # NEON optims
 ifeq ($(ARCH),ARM)
 ifneq ($(AS),)
-ASMSRC += common/arm/cpu-a.S
+ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S
 OBJASM = $(ASMSRC:%.S=%.o)
 endif
 endif
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
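// EABI attributes 24 (Tag_ABI_align_needed) and 25 (Tag_ABI_align_preserved):
// mark whether the code requires / preserves 8-byte stack alignment.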
.macro require8, val=1
.eabi_attribute 24, \val
.endm
.macro preserve8, val=1
.eabi_attribute 25, \val
.endm
.macro function name, export=0
.if \export
.global \name
.endif
.type \name, %function
.func \name
\name:
.endm
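// movrel: load the address of \val into \rd, using movw/movt where the target
// supports them and the build is not PIC, otherwise via a literal pool.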
.macro movrel rd, val
#if defined(HAVE_ARMV6T2) && !defined(PIC)
movw \rd, #:lower16:\val
movt \rd, #:upper16:\val
#else
ldr \rd, =\val
#endif
.endm
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
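// HORIZ_ADD: horizontally reduce the u16 lanes of \a (plus \b, if given) to a
// single sum in \dest.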
.macro HORIZ_ADD dest, a, b
.ifnb \b
vadd.u16 \a, \a, \b
.endif
vpaddl.u16 \a, \a
vpaddl.u32 \dest, \a
.endm
.macro SUMSUB_AB sum, diff, a, b
vadd.s16 \sum, \a, \b
vsub.s16 \diff, \a, \b
.endm
.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
SUMSUB_AB \s1, \d1, \a, \b
SUMSUB_AB \s2, \d2, \c, \d
.endm
.macro ABS2 a b
vabs.s16 \a, \a
vabs.s16 \b, \b
.endm
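// These SUMSUB/ABS/HADAMARD helpers build up the butterflies of the 4-point
// Hadamard transform used by the SATD, SA8D and Hadamard_AC metrics. A scalar
// sketch of one 4-point transform (variable names are illustrative only):
//
//   int s0 = a + b, d0 = a - b;        // first butterfly stage
//   int s1 = c + d, d1 = c - d;
//   int t0 = s0 + s1, t1 = s0 - s1;    // second stage: t0..t3 are the
//   int t2 = d0 + d1, t3 = d0 - d1;    // Hadamard coefficients of {a,b,c,d}
//
// Applying this along rows and columns of a difference block and summing
// absolute values gives the SATD-style sums.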
// dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes)
// op = sumsub/amax (sum and diff / maximum of absolutes)
// d1/2 = destination registers
// s1/2 = source registers
.macro HADAMARD dist, op, d1, d2, s1, s2
.if \dist == 1
vtrn.16 \s1, \s2
.else
vtrn.32 \s1, \s2
.endif
.ifc \op, sumsub
SUMSUB_AB \d1, \d2, \s1, \s2
.else
vabs.s16 \s1, \s1
vabs.s16 \s2, \s2
vmax.s16 \d1, \s1, \s2
.endif
.endm
/*****************************************************************************
* cpu-a.S: h264 encoder library
*****************************************************************************
* Copyright (C) 2009 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
#include "asm.S"
.fpu neon
.align
// done in gas because .fpu neon overrides the refusal to assemble
// instructions the selected -march/-mcpu doesn't support
function x264_cpu_neon_test, export=1
vadd.i16 q0, q0, q0
bx lr
.endfunc
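// The caller must catch the SIGILL this probe raises on CPUs without NEON.
// A minimal sketch of such a caller, assuming a void prototype for the probe
// (x264's real detection logic lives in common/cpu.c; this is illustrative
// only):
//
//   #include <setjmp.h>
//   #include <signal.h>
//
//   void x264_cpu_neon_test( void );
//   static sigjmp_buf jmpbuf;
//
//   static void sigill_handler( int sig )
//   {
//       (void)sig;
//       siglongjmp( jmpbuf, 1 );
//   }
//
//   static int have_neon( void )
//   {
//       int ok = 0;
//       signal( SIGILL, sigill_handler );
//       if( !sigsetjmp( jmpbuf, 1 ) )
//       {
//           x264_cpu_neon_test();      // faults here if NEON is unsupported
//           ok = 1;
//       }
//       signal( SIGILL, SIG_DFL );
//       return ok;
//   }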
// return: 0 on success
// 1 if counters were already enabled
// 9 if lo-res counters were already enabled
function x264_cpu_enable_armv7_counter
mrc p15, 0, r2, c9, c12, 0 // read PMNC
ands r0, r2, #1
andne r0, r2, #9
orr r2, r2, #1 // enable counters
bic r2, r2, #8 // full resolution
mcreq p15, 0, r2, c9, c12, 0 // write PMNC
mov r2, #1 << 31 // enable cycle counter
mcr p15, 0, r2, c9, c12, 1 // write CNTENS
bx lr
.endfunc
function x264_cpu_disable_armv7_counter
mrc p15, 0, r0, c9, c12, 0 // read PMNC
bic r0, r0, #1 // disable counters
mcr p15, 0, r0, c9, c12, 0 // write PMNC
bx lr
.endfunc
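// READ_TIME reads CCNT, the ARMv7 PMU cycle counter (CP15 c9, c13, 0), which
// the functions above enable at full resolution.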
.macro READ_TIME r
mrc p15, 0, \r, c9, c13, 0
.endm
// return: 0 if NEON -> ARM transfers take more than 10 cycles
//         nonzero otherwise
function x264_cpu_fast_neon_mrc_test, export=1
// check for user access to performance counters
mrc p15, 0, r0, c9, c14, 0
cmp r0, #0
bxeq lr
push {r4-r6,lr}
bl x264_cpu_enable_armv7_counter
ands r1, r0, #8
mov r3, #0
mov ip, #4
mov r6, #4
moveq r5, #1
movne r5, #64
average_loop:
mov r4, r5
READ_TIME r1
1: subs r4, r4, #1
.rept 8
vmov.u32 lr, d0[0]
add lr, lr, lr
.endr
bgt 1b
READ_TIME r2
subs r6, r6, #1
sub r2, r2, r1
cmpgt r2, #30 << 3 // assume context switch if it took over 30 cycles
addle r3, r3, r2
subles ip, ip, #1
bgt average_loop
// disable counters if we enabled them
ands r0, r0, #1
bleq x264_cpu_disable_armv7_counter
lsr r0, r3, #5
cmp r0, #10
movgt r0, #0
pop {r4-r6,pc}
.endfunc
/*****************************************************************************
* pixel.S: h264 encoder
*****************************************************************************
* Copyright (C) 2009 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
#include "asm.S"
.fpu neon
.section .rodata
.align 4
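// The 16 bytes of 0xff below sit just before the mask_ff label so that a load
// at a negative offset from it yields a run of all-ones bytes (presumably for
// masking a variable number of lanes); mask_ac4/mask_ac8 zero the DC
// position(s) for the hadamard_ac sums.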
.rept 16 .byte 0xff
.endr
mask_ff:
.rept 16 .byte 0
.endr
mask_ac4:
.short 0, -1, -1, -1, 0, -1, -1, -1
mask_ac8:
.short 0, -1, -1, -1, -1, -1, -1, -1
.text
.macro SAD4_ARMV6 h
function x264_pixel_sad_4x\h\()_armv6, export=1
push {r4-r6,lr}
ldr r4, [r2], r3
ldr r5, [r0], r1
ldr r6, [r2], r3
ldr lr, [r0], r1
usad8 ip, r4, r5
.rept (\h - 2)/2
ldr r4, [r2], r3
ldr r5, [r0], r1
usada8 ip, r6, lr, ip
ldr r6, [r2], r3
ldr lr, [r0], r1
usada8 ip, r4, r5, ip
.endr
usada8 r0, r6, lr, ip
pop {r4-r6,pc}
.endfunc
.endm
SAD4_ARMV6 4
SAD4_ARMV6 8
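// For reference, the SAD routines in this file (ARMv6 and NEON alike) all
// compute the usual sum of absolute differences. A minimal scalar sketch,
// assuming x264's (pix1, stride1, pix2, stride2) argument order:
//
//   #include <stdint.h>
//   #include <stdlib.h>
//
//   static int sad_wxh( uint8_t *pix1, int i_stride1,
//                       uint8_t *pix2, int i_stride2, int w, int h )
//   {
//       int sum = 0;
//       for( int y = 0; y < h; y++, pix1 += i_stride1, pix2 += i_stride2 )
//           for( int x = 0; x < w; x++ )
//               sum += abs( pix1[x] - pix2[x] );
//       return sum;
//   }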
.macro SAD_START_4 align:vararg
vld1.32 {d1[]}, [r2 \align], r3
vld1.32 {d0[]}, [r0,:32], r1
vabdl.u8 q8, d0, d1
.endm
.macro SAD_4 align:vararg
vld1.32 {d1[]}, [r2 \align], r3
vld1.32 {d0[]}, [r0,:32], r1
vabal.u8 q8, d0, d1
.endm
.macro SAD_START_8 align:vararg
vld1.64 {d1}, [r2 \align], r3
vld1.64 {d0}, [r0,:64], r1
vabdl.u8 q8, d0, d1
.endm
.macro SAD_8 align:vararg
vld1.64 {d1}, [r2 \align], r3
vld1.64 {d0}, [r0,:64], r1
vabal.u8 q8, d0, d1
.endm
.macro SAD_START_16 align:vararg
vld1.64 {d2-d3}, [r2 \align], r3
vld1.64 {d0-d1}, [r0,:128], r1
vabdl.u8 q8, d0, d2
vld1.64 {d6-d7}, [r2 \align], r3
vabdl.u8 q9, d1, d3
vld1.64 {d4-d5}, [r0,:128], r1
.endm
.macro SAD_16 align:vararg
vabal.u8 q8, d4, d6
vld1.64 {d2-d3}, [r2 \align], r3
vabal.u8 q9, d5, d7
vld1.64 {d0-d1}, [r0,:128], r1
vabal.u8 q8, d0, d2
vld1.64 {d6-d7}, [r2 \align], r3
vabal.u8 q9, d1, d3
vld1.64 {d4-d5}, [r0,:128], r1
.endm
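// The 16-wide SAD_START/SAD macros work on two rows per iteration, keeping one
// row of loads in flight, so they repeat \h / 2 - 1 times and the final
// pending row is accumulated after the loop; the 4- and 8-wide variants handle
// one row per iteration and repeat \h - 1 times.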
.macro SAD_FUNC w, h, name, align:vararg
function x264_pixel_sad\name\()_\w\()x\h\()_neon, export=1
.if \w == 16
.set r, \h / 2 - 1
.else
.set r, \h - 1
.endif
SAD_START_\w \align
.rept r
SAD_\w \align
.endr
.if \w > 8
vabal.u8 q8, d4, d6
vabal.u8 q9, d5, d7
vadd.u16 q8, q8, q9
.endif
.if \w > 4
vadd.u16 d16, d16, d17
.endif
vpadd.u16 d0, d16, d16
vpaddl.u16 d0, d0
vmov.u32 r0, d0[0]
bx lr
.endfunc
.endm
SAD_FUNC 4, 4
SAD_FUNC 4, 8
SAD_FUNC 8, 4
SAD_FUNC 8, 8
SAD_FUNC 8, 16
SAD_FUNC 16, 8
SAD_FUNC 16, 16
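// The _aligned variants pass a NEON alignment qualifier (,:32 / ,:64 / ,:128)
// through to the loads from r2, for faster accesses when the second pixel
// pointer is known to be suitably aligned.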
SAD_FUNC 4, 4, _aligned, ,:32
SAD_FUNC 4, 8, _aligned, ,:32
SAD_FUNC 8, 4, _aligned, ,:64
SAD_FUNC 8, 8, _aligned, ,:64
SAD_FUNC 8, 16, _aligned, ,:64
SAD_FUNC 16, 8, _aligned, ,:128
SAD_FUNC 16, 16, _aligned, ,:128
// If dual issue is possible, use additional accumulators to avoid
// stalls from vabal's latency. This only matters for aligned.
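// In scalar terms this just splits the running SAD into two (or four)
// independent accumulators that are added together at the end, so consecutive
// vabal instructions do not all serialize on one destination register.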
.macro SAD_DUAL_START_8
SAD_START_8 ,:64
vld1.64 {d3}, [r2,:64], r3
vld1.64 {d2}, [r0,:64], r1
vabdl.u8 q9, d2, d3
.endm
.macro SAD_DUAL_8 align:vararg
vld1.64 {d1}, [r2,:64], r3
vld1.64 {d0}, [r0,:64], r1
vabal.u8 q8, d0, d1
vld1.64 {d3}, [r2,:64], r3
vld1.64 {d2}, [r0,:64], r1
vabal.u8 q9, d2, d3
.endm
.macro SAD_DUAL_START_16
SAD_START_16 ,:128
vabdl.u8 q10, d4, d6
vld1.64 {d2-d3}, [r2,:128], r3
vabdl.u8 q11, d5, d7
vld1.64 {d0-d1}, [r0,:128], r1
.endm
.macro SAD_DUAL_16
vabal.u8 q8, d0, d2
vld1.64 {d6-d7}, [r2,:128], r3
vabal.u8 q9, d1, d3
vld1.64 {d4-d5}, [r0,:128], r1
vabal.u8 q10, d4, d6
vld1.64 {d2-d3}, [r2,:128], r3
vabal.u8 q11, d5, d7
vld1.64 {d0-d1}, [r0,:128], r1
.endm
.macro SAD_DUAL_END_16
vabal.u8 q8, d0, d2
vld1.64 {d6-d7}, [r2,:128], r3
vabal.u8 q9, d1, d3
vld1.64 {d4-d5}, [r0,:128], r1
vabal.u8 q10, d4, d6
vabal.u8 q11, d5, d7
.endm
.macro SAD_FUNC_DUAL w, h
function x264_pixel_sad_aligned_\w\()x\h\()_neon_dual, export=1
.if \w == 16
.set r, \h / 2 - 2
.else
.set r, \h / 2 - 1
.endif
SAD_DUAL_START_\w
.rept \h / 2 - \w / 8
SAD_DUAL_\w
.endr
.if \w > 8
SAD_DUAL_END_16
vadd.u16 q8, q8, q9
vadd.u16 q9, q10, q11
.endif
.if \w > 4
vadd.u16 q8, q8, q9
vadd.u16 d16, d16, d17
.endif
vpadd.u16 d0, d16, d16
vpaddl.u16 d0, d0
vmov.u32 r0, d0[0]
bx lr
.endfunc
.endm
SAD_FUNC_DUAL 8, 4
SAD_FUNC_DUAL 8, 8
SAD_FUNC_DUAL 8, 16
SAD_FUNC_DUAL 16, 8
SAD_FUNC_DUAL 16, 16
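// sad_x3/sad_x4 compare one encode-order block (fixed FENC_STRIDE) against
// three or four candidate references that share a stride, and store the SADs
// to an output array instead of returning them. Roughly (a sketch; the exact
// prototypes live in common/pixel.h):
//
//   void sad_x4_wxh( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
//                    uint8_t *pix2, uint8_t *pix3, int i_stride,
//                    int scores[4] );
//   // scores[i] = SAD of fenc (stride FENC_STRIDE) vs pixi (stride i_stride)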
.macro SAD_X_START_4 x
vld1.32 {d0[]}, [r0,:32], lr
vld1.32 {d1[]}, [r1], r6
vabdl.u8 q8, d1, d0
vld1.32 {d2[]}, [r2], r6
vabdl.u8 q9, d2, d0
vld1.32 {d3[]}, [r3], r6
vabdl.u8 q10, d3, d0
.if \x == 4
vld1.32 {d4[]}, [r12], r6
vabdl.u8 q11, d4, d0
.endif
.endm
.macro SAD_X_4 x
vld1.32 {d0[]}, [r0,:32], lr
vld1.32 {d1[]}, [r1], r6
vabal.u8 q8, d1, d0
vld1.32 {d2[]}, [r2], r6
vabal.u8 q9, d2, d0
vld1.32 {d3[]}, [r3], r6
vabal.u8 q10, d3, d0
.if \x == 4
vld1.32 {d4[]}, [r12], r6
vabal.u8 q11, d4, d0
.endif
.endm
.macro SAD_X_START_8 x
vld1.64 {d0}, [r0,:64], lr
vld1.64 {d1}, [r1], r6
vabdl.u8 q8, d1, d0
vld1.64 {d2}, [r2], r6
vabdl.u8 q9, d2, d0
vld1.64 {d3}, [r3], r6
vabdl.u8 q10, d3, d0
.if \x == 4
vld1.64 {d4}, [r12], r6
vabdl.u8 q11, d4, d0
.endif
.endm
.macro SAD_X_8 x
vld1.64 {d0}, [r0,:64], lr
vld1.64 {d1}, [r1], r6
vabal.u8 q8, d1, d0
vld1.64 {d2}, [r2], r6
vabal.u8 q9, d2, d0
vld1.64 {d3}, [r3], r6
vabal.u8 q10, d3, d0
.if \x == 4
vld1.64 {d4}, [r12], r6
vabal.u8 q11, d4, d0
.endif
.endm
.macro SAD_X_START_16 x
vld1.64 {d0-d1}, [r0,:128], lr
vld1.64 {d2-d3}, [r1], r6
vabdl.u8 q8, d2, d0
vabdl.u8 q12, d3, d1
vld1.64 {d4-d5}, [r2], r6
vabdl.u8 q9, d4, d0
vabdl.u8 q13, d5, d1
vld1.64 {d6-d7}, [r3], r6
vabdl.u8 q10, d6, d0
vabdl.u8 q14, d7, d1
.if \x == 4
vld1.64 {d2-d3}, [r12], r6
vabdl.u8 q11, d2, d0
vabdl.u8 q15, d3, d1
.endif
.endm
.macro SAD_X_16 x
vld1.64 {d0-d1}, [r0,:128], lr
vld1.64 {d2-d3}, [r1], r6
vabal.u8 q8, d2, d0
vabal.u8 q12, d3, d1
vld1.64 {d4-d5}, [r2], r6
vabal.u8 q9, d4, d0
vabal.u8 q13, d5, d1
vld1.64 {d6-d7}, [r3], r6
vabal.u8 q10, d6, d0
vabal.u8 q14, d7, d1
.if \x == 4
vld1.64 {d2-d3}, [r12], r6
vabal.u8 q11, d2, d0
vabal.u8 q15, d3, d1
.endif
.endm
.macro SAD_X_FUNC x, w, h
function x264_pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
push {r6-r7,lr}
.if \x == 3
ldrd r6, [sp, #12]
.else
ldrd r6, [sp, #16]
ldr r12, [sp, #12]
.endif
mov lr, #FENC_STRIDE
SAD_X_START_\w \x
.rept \h - 1
SAD_X_\w \x
.endr
// add up the sads
.if \w > 8
vadd.u16 q8, q8, q12
vadd.u16 q9, q9, q13
vadd.u16 q10, q10, q14
.if \x == 4
vadd.u16 q11, q11, q15
.endif
.endif
.if \w > 4
vadd.u16 d16, d16, d17
vadd.u16 d18, d18, d19
vadd.u16 d20, d20, d21
.if \x == 4
vadd.u16 d22, d22, d23
.endif
.endif
vpadd.u16 d0, d16, d18
vpadd.u16 d1, d20, d22
vpaddl.u16 q0, q0
.if \x == 3
vst1.32 {d0}, [r7]!
vst1.32 {d1[0]}, [r7,:32]
.else
vst1.32 {d0-d1}, [r7]
.endif
pop {r6-r7,pc}
.endfunc
.endm
SAD_X_FUNC 3, 4, 4
SAD_X_FUNC 3, 4, 8
SAD_X_FUNC 3, 8, 4
SAD_X_FUNC 3, 8, 8
SAD_X_FUNC 3, 8, 16
SAD_X_FUNC 3, 16, 8
SAD_X_FUNC 3, 16, 16
SAD_X_FUNC 4, 4, 4
SAD_X_FUNC 4, 4, 8
SAD_X_FUNC 4, 8, 4
SAD_X_FUNC 4, 8, 8
SAD_X_FUNC 4, 8, 16
SAD_X_FUNC 4, 16, 8
SAD_X_FUNC 4, 16, 16
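// The SSD routines widen-subtract the u8 rows (vsubl.u8) and square-accumulate
// the signed 16-bit differences (vmull/vmlal.s16) into a 32-bit sum. A minimal
// scalar sketch of the same computation:
//
//   #include <stdint.h>
//
//   static int ssd_wxh( uint8_t *pix1, int i_stride1,
//                       uint8_t *pix2, int i_stride2, int w, int h )
//   {
//       int ssd = 0;
//       for( int y = 0; y < h; y++, pix1 += i_stride1, pix2 += i_stride2 )
//           for( int x = 0; x < w; x++ )
//           {
//               int d = pix1[x] - pix2[x];
//               ssd += d * d;
//           }
//       return ssd;
//   }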
.macro SSD_START_4
vld1.32 {d16[]}, [r0,:32], r1
vld1.32 {d17[]}, [r2,:32], r3
vsubl.u8 q2, d16, d17
vld1.32 {d16[]}, [r0,:32], r1
vmull.s16 q0, d4, d4
vld1.32 {d17[]}, [r2,:32], r3
.endm
.macro SSD_4
vsubl.u8 q2, d16, d17
vld1.32 {d16[]}, [r0,:32], r1
vmlal.s16 q0, d4, d4
vld1.32 {d17[]}, [r2,:32], r3
.endm
.macro SSD_END_4
vsubl.u8 q2, d16, d17
vmlal.s16 q0, d4, d4
.endm
.macro SSD_START_8
vld1.64 {d16}, [r0,:64], r1
vld1.64 {d17}, [r2,:64], r3
vsubl.u8 q2, d16, d17
vld1.64 {d16}, [r0,:64], r1
vmull.s16 q0, d4, d4
vmlal.s16 q0, d5, d5
vld1.64 {d17}, [r2,:64], r3
.endm
.macro SSD_8
vsubl.u8 q2, d16, d17
vld1.64 {d16}, [r0,:64], r1
vmlal.s16 q0, d4, d4
vmlal.s16 q0, d5, d5
vld1.64 {d17}, [r2,:64], r3
.endm
.macro SSD_END_8
vsubl.u8 q2, d16, d17
vmlal.s16 q0, d4, d4
vmlal.s16 q0, d5, d5
.endm
.macro SSD_START_16
vld1.64 {d16-d17}, [r0,:128], r1
vld1.64 {d18-d19}, [r2,:128], r3
vsubl.u8 q2, d16, d18
vsubl.u8 q3, d17, d19
vld1.64 {d16-d17}, [r0,:128], r1
vmull.s16 q0, d4, d4
vmlal.s16 q0, d5, d5
vld1.64 {d18-d19}, [r2,:128], r3
vmlal.s16 q0, d6, d6
vmlal.s16 q0, d7, d7
.endm
.macro SSD_16
vsubl.u8 q2, d16, d18
vsubl.u8 q3, d17, d19
vld1.64 {d16-d17}, [r0,:128], r1
vmlal.s16 q0, d4, d4
vmlal.s16 q0, d5, d5
vld1.64 {d18-d19}, [r2,:128], r3
vmlal.s16 q0, d6, d6
vmlal.s16 q0, d7, d7
.endm
.macro SSD_END_16
vsubl.u8 q2, d16, d18
vsubl.u8 q3, d17, d19
vmlal.s16 q0, d4, d4
vmlal.s16 q0, d5, d5
vmlal.s16 q0, d6, d6
vmlal.s16 q0, d7, d7
.endm
.macro SSD_FUNC w h
function x264_pixel_ssd_\w\()x\h\()_neon, export=1
SSD_START_\w
.rept \h-2
SSD_\w
.endr
SSD_END_\w
vadd.s32 d0, d0, d1
vpadd.s32 d0, d0, d0
vmov.32 r0, d0[0]
bx lr
.endfunc
.endm
SSD_FUNC 4, 4
SSD_FUNC 4, 8
SSD_FUNC 8, 4
SSD_FUNC 8, 8
SSD_FUNC 8, 16
SSD_FUNC 16, 8
SSD_FUNC 16, 16
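// VAR_SQR_SUM accumulates both the pixel sum (vaddw.u8 into q0) and the sum of
// squares (vmull.u8, folded in via vpadal) in one pass over the block; the
// variance then follows from Var(X) = E[X^2] - E[X]^2, i.e.
// sum_sq - sum*sum/N with N = 64 for an 8x8 block.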
.macro VAR_SQR_SUM qsqr_sum qsqr_last qsqr dsrc vpadal=vpadal.u16
vmull.u8 \qsqr, \dsrc, \dsrc
vaddw.u8 q0, q0, \dsrc
\vpadal \qsqr_sum, \qsqr_last
.endm
function x264_pixel_var_8x8_neon, export=1
vld1.64 {d16}, [r0,:64], r1