Commit 53a5772a authored by David Conrad, committed by Fiona Glaser

Various ARM-related fixes

Fix comment for mc_copy_neon.
Fix memzero_aligned_neon prototype.
Update NEON (i)dct_dc prototypes.
Duplicate x86 behavior for global+hidden functions.
parent 30b3825e
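A note on the last point: on ELF targets, x264's x86 assembly marks its global functions as hidden, so they can be called across the library's own object files but are not exported from a shared libx264. A rough C analogue of that symbol-level effect (an illustration of the visibility attribute only; the function name is hypothetical, not code from this commit):

    /* Global to the linker across the library's own objects, but omitted
     * from a shared library's dynamic symbol table on ELF targets -- the
     * behavior the ARM "function" macro below now adopts. */
    __attribute__((visibility("hidden")))
    int x264_example_internal( int x )
    {
        return x + 1;
    }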
@@ -119,6 +119,7 @@ checkasm: tools/checkasm.o libx264.a
%.o: %.S
$(AS) $(ASFLAGS) -o $@ $<
+-@ $(STRIP) -x $@ # delete local/anonymous symbols, so they don't show up in oprofile
.depend: config.mak
rm -f .depend
......
@@ -20,19 +20,24 @@
+#include "config.h"
+#ifdef __ELF__
+# define ELF
+#else
+# define ELF @
+#endif
.macro require8, val=1
-.eabi_attribute 24, \val
+ELF .eabi_attribute 24, \val
.endm
.macro preserve8, val=1
-.eabi_attribute 25, \val
+ELF .eabi_attribute 25, \val
.endm
-.macro function name, export=0
-.if \export
+.macro function name
.global \name
-.endif
-.type \name, %function
+ELF .hidden \name
+ELF .type \name, %function
.func \name
\name:
.endm
......
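The ELF macro is what keeps the new directives portable: when __ELF__ is defined it expands to nothing and the rest of the line assembles normally; otherwise it expands to @, ARM gas's comment character, and the line is ignored. Under that reading, a definition such as function x264_dct4x4dc_neon should expand roughly as follows (a sketch derived from the macro body above, not actual assembler output):

    @ ELF target (ELF expands to nothing):
    .global x264_dct4x4dc_neon
    .hidden x264_dct4x4dc_neon
    .type   x264_dct4x4dc_neon, %function
    .func   x264_dct4x4dc_neon
    x264_dct4x4dc_neon:

    @ non-ELF target (ELF expands to @, commenting out the two lines above):
    .global x264_dct4x4dc_neon
    .func   x264_dct4x4dc_neon
    x264_dct4x4dc_neon: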
@@ -27,7 +27,7 @@
// done in gas because .fpu neon overrides the refusal to assemble
// instructions the selected -march/-mcpu doesn't support
-function x264_cpu_neon_test, export=1
+function x264_cpu_neon_test
vadd.i16 q0, q0, q0
bx lr
.endfunc
@@ -62,7 +62,7 @@ function x264_cpu_disable_armv7_counter
// return: 0 if neon -> arm transfers take more than 10 cycles
// nonzero otherwise
-function x264_cpu_fast_neon_mrc_test, export=1
+function x264_cpu_fast_neon_mrc_test
// check for user access to performance counters
mrc p15, 0, r0, c9, c14, 0
cmp r0, #0
......
@@ -62,7 +62,7 @@ scan4x4_frame:
.endm
-function x264_dct4x4dc_neon, export=1
+function x264_dct4x4dc_neon
vld1.64 {d0-d3}, [r0,:128]
SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3
SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7
@@ -81,7 +81,7 @@ function x264_dct4x4dc_neon, export=1
bx lr
.endfunc
-function x264_idct4x4dc_neon, export=1
+function x264_idct4x4dc_neon
vld1.64 {d0-d3}, [r0,:128]
SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3
SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7
@@ -105,7 +105,7 @@ function x264_idct4x4dc_neon, export=1
vsub.s16 \d3, \d7, \d5
.endm
-function x264_sub4x4_dct_neon, export=1
+function x264_sub4x4_dct_neon
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
vld1.32 {d0[]}, [r1,:32], r3
@@ -128,7 +128,7 @@ function x264_sub4x4_dct_neon, export=1
bx lr
.endfunc
-function x264_sub8x4_dct_neon, export=1
+function x264_sub8x4_dct_neon
vld1.64 {d0}, [r1,:64], r3
vld1.64 {d1}, [r2,:64], ip
vsubl.u8 q8, d0, d1
@@ -164,7 +164,7 @@ function x264_sub8x4_dct_neon, export=1
bx lr
.endfunc
-function x264_sub8x8_dct_neon, export=1
+function x264_sub8x8_dct_neon
push {lr}
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
@@ -173,7 +173,7 @@ function x264_sub8x8_dct_neon, export=1
b x264_sub8x4_dct_neon
.endfunc
-function x264_sub16x16_dct_neon, export=1
+function x264_sub16x16_dct_neon
push {lr}
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
@@ -226,7 +226,7 @@ function x264_sub16x16_dct_neon, export=1
SUMSUB_SHR2 2, q11, q13, q3, q13, q0, q1
.endm
-function x264_sub8x8_dct8_neon, export=1
+function x264_sub8x8_dct8_neon
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
vld1.64 {d16}, [r1,:64], r3
@@ -278,7 +278,7 @@ function x264_sub8x8_dct8_neon, export=1
bx lr
.endfunc
-function x264_sub16x16_dct8_neon, export=1
+function x264_sub16x16_dct8_neon
push {lr}
bl x264_sub8x8_dct8_neon
sub r1, r1, #FENC_STRIDE*8 - 8
@@ -303,7 +303,7 @@ function x264_sub16x16_dct8_neon, export=1
vadd.s16 \d6, \d6, \d1
.endm
-function x264_add4x4_idct_neon, export=1
+function x264_add4x4_idct_neon
mov r2, #FDEC_STRIDE
vld1.64 {d0-d3}, [r1,:128]
@@ -335,7 +335,7 @@ function x264_add4x4_idct_neon, export=1
bx lr
.endfunc
-function x264_add8x4_idct_neon, export=1
+function x264_add8x4_idct_neon
vld1.64 {d0-d3}, [r1,:128]!
IDCT_1D d16, d18, d20, d22, d0, d1, d2, d3
vld1.64 {d4-d7}, [r1,:128]!
@@ -375,7 +375,7 @@ function x264_add8x4_idct_neon, export=1
bx lr
.endfunc
-function x264_add8x8_idct_neon, export=1
+function x264_add8x8_idct_neon
mov r2, #FDEC_STRIDE
mov ip, lr
bl x264_add8x4_idct_neon
@@ -383,7 +383,7 @@ function x264_add8x8_idct_neon, export=1
b x264_add8x4_idct_neon
.endfunc
-function x264_add16x16_idct_neon, export=1
+function x264_add16x16_idct_neon
mov r2, #FDEC_STRIDE
mov ip, lr
bl x264_add8x4_idct_neon
@@ -435,7 +435,7 @@ function x264_add16x16_idct_neon, export=1
SUMSUB_AB q11, q12, q2, q12
.endm
-function x264_add8x8_idct8_neon, export=1
+function x264_add8x8_idct8_neon
mov r2, #FDEC_STRIDE
vld1.64 {d16-d19}, [r1,:128]!
vld1.64 {d20-d23}, [r1,:128]!
@@ -497,7 +497,7 @@ function x264_add8x8_idct8_neon, export=1
bx lr
.endfunc
-function x264_add16x16_idct8_neon, export=1
+function x264_add16x16_idct8_neon
mov ip, lr
bl x264_add8x8_idct8_neon
sub r0, r0, #8*FDEC_STRIDE-8
@@ -510,7 +510,7 @@ function x264_add16x16_idct8_neon, export=1
.endfunc
-function x264_add8x8_idct_dc_neon, export=1
+function x264_add8x8_idct_dc_neon
mov r2, #FDEC_STRIDE
vld1.64 {d16}, [r1,:64]
vrshr.s16 d16, d16, #6
@@ -593,7 +593,7 @@ function x264_add8x8_idct_dc_neon, export=1
vst1.64 {d22-d23}, [r2,:128], r3
.endm
-function x264_add16x16_idct_dc_neon, export=1
+function x264_add16x16_idct_dc_neon
mov r2, r0
mov r3, #FDEC_STRIDE
vmov.i16 q15, #0
@@ -609,7 +609,7 @@ function x264_add16x16_idct_dc_neon, export=1
bx lr
.endfunc
-function x264_sub8x8_dct_dc_neon, export=1
+function x264_sub8x8_dct_dc_neon
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
vld1.64 {d16}, [r1,:64], r3
@@ -650,7 +650,7 @@ function x264_sub8x8_dct_dc_neon, export=1
.endfunc
-function x264_zigzag_scan_4x4_frame_neon, export=1
+function x264_zigzag_scan_4x4_frame_neon
movrel r2, scan4x4_frame
vld1.64 {d0-d3}, [r1,:128]
vld1.64 {d16-d19}, [r2,:128]
......
@@ -34,9 +34,9 @@ void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
-void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[2][2] );
+void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
-void x264_sub8x8_dct_dc_neon( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
......
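The two dc prototypes shrink from int16_t dct[2][2] to int16_t dct[4] because the four DC coefficients travel as one flat, contiguous array; both spellings decay to the same int16_t *, but the flat form matches how the rest of this header declares its dct arguments. A minimal hypothetical call site (function name and coefficient values assumed, for illustration only):

    #include <stdint.h>

    void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );

    /* Hypothetical: one DC coefficient per 4x4 sub-block of the 8x8 block,
     * stored contiguously; dst must point at an 8x8 block laid out with the
     * row pitch the NEON code assumes (FDEC_STRIDE). */
    void add_dc_example( uint8_t *dst )
    {
        int16_t dc[4] = { 32, -32, 64, -64 };
        x264_add8x8_idct_dc_neon( dst, dc );
    }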
@@ -115,7 +115,7 @@
vqmovun.s16 d1, q12
.endm
-function x264_deblock_v_luma_neon, export=1
+function x264_deblock_v_luma_neon
h264_loop_filter_start
vld1.64 {d0, d1}, [r0,:128], r1
@@ -141,7 +141,7 @@ function x264_deblock_v_luma_neon, export=1
bx lr
.endfunc
-function x264_deblock_h_luma_neon, export=1
+function x264_deblock_h_luma_neon
h264_loop_filter_start
sub r0, r0, #4
@@ -226,7 +226,7 @@ function x264_deblock_h_luma_neon, export=1
vqmovun.s16 d0, q11
.endm
-function x264_deblock_v_chroma_neon, export=1
+function x264_deblock_v_chroma_neon
h264_loop_filter_start
sub r0, r0, r1, lsl #1
@@ -244,7 +244,7 @@ function x264_deblock_v_chroma_neon, export=1
bx lr
.endfunc
-function x264_deblock_h_chroma_neon, export=1
+function x264_deblock_h_chroma_neon
h264_loop_filter_start
sub r0, r0, #2
......
@@ -30,7 +30,7 @@
// They also use nothing above armv5te, but we don't care about pre-armv6
// void prefetch_ref( uint8_t *pix, int stride, int parity )
-function x264_prefetch_ref_arm, export=1
+function x264_prefetch_ref_arm
sub r2, r2, #1
add r0, r0, #64
and r2, r2, r1
@@ -50,7 +50,7 @@ function x264_prefetch_ref_arm, export=1
// void prefetch_fenc( uint8_t *pix_y, int stride_y,
// uint8_t *pix_uv, int stride_uv, int mb_x )
-function x264_prefetch_fenc_arm, export=1
+function x264_prefetch_fenc_arm
ldr ip, [sp]
push {lr}
and lr, ip, #3
@@ -76,7 +76,7 @@ function x264_prefetch_fenc_arm, export=1
// void *x264_memcpy_aligned( void * dst, const void * src, size_t n )
-function x264_memcpy_aligned_neon, export=1
+function x264_memcpy_aligned_neon
orr r3, r0, r1, lsr #1
movrel ip, memcpy_table
and r3, r3, #0xc
@@ -138,7 +138,7 @@ memcpy_table:
.ltorg
// void x264_memzero_aligned( void *dst, size_t n )
-function x264_memzero_aligned_neon, export=1
+function x264_memzero_aligned_neon
vmov.i8 q0, #0
vmov.i8 q1, #0
memzero_loop:
@@ -155,7 +155,7 @@ memzero_loop:
// uint8_t *src1, int src1_stride,
// uint8_t *src2, int src2_stride, int weight );
.macro AVGH w h
-function x264_pixel_avg_\w\()x\h\()_neon, export=1
+function x264_pixel_avg_\w\()x\h\()_neon
ldr ip, [sp, #8]
push {r4-r6,lr}
cmp ip, #32
@@ -230,7 +230,7 @@ AVGH 16, 16
.endm
.macro AVG_WEIGHT ext
-function x264_pixel_avg_weight_w4_\ext\()_neon, export=1
+function x264_pixel_avg_weight_w4_\ext\()_neon
load_weights_\ext
1: // height loop
subs lr, lr, #2
@@ -246,7 +246,7 @@ function x264_pixel_avg_weight_w4_\ext\()_neon, export=1
pop {r4-r6,pc}
.endfunc
-function x264_pixel_avg_weight_w8_\ext\()_neon, export=1
+function x264_pixel_avg_weight_w8_\ext\()_neon
load_weights_\ext
1: // height loop
subs lr, lr, #4
@@ -270,7 +270,7 @@ function x264_pixel_avg_weight_w8_\ext\()_neon, export=1
pop {r4-r6,pc}
.endfunc
-function x264_pixel_avg_weight_w16_\ext\()_neon, export=1
+function x264_pixel_avg_weight_w16_\ext\()_neon
load_weights_\ext
1: // height loop
subs lr, lr, #2
@@ -295,7 +295,7 @@ AVG_WEIGHT add_add
AVG_WEIGHT add_sub
AVG_WEIGHT sub_add
-function x264_pixel_avg_w4_neon, export=1
+function x264_pixel_avg_w4_neon
subs lr, lr, #2
vld1.32 {d0[]}, [r2], r3
vld1.32 {d2[]}, [r4], r5
@@ -309,7 +309,7 @@ function x264_pixel_avg_w4_neon, export=1
pop {r4-r6,pc}
.endfunc
-function x264_pixel_avg_w8_neon, export=1
+function x264_pixel_avg_w8_neon
subs lr, lr, #4
vld1.64 {d0}, [r2], r3
vld1.64 {d2}, [r4], r5
@@ -331,7 +331,7 @@ function x264_pixel_avg_w8_neon, export=1
pop {r4-r6,pc}
.endfunc
-function x264_pixel_avg_w16_neon, export=1
+function x264_pixel_avg_w16_neon
subs lr, lr, #4
vld1.64 {d0-d1}, [r2], r3
vld1.64 {d2-d3}, [r4], r5
@@ -354,7 +354,7 @@ function x264_pixel_avg_w16_neon, export=1
.endfunc
-function x264_pixel_avg2_w4_neon, export=1
+function x264_pixel_avg2_w4_neon
ldr ip, [sp, #4]
push {lr}
ldr lr, [sp, #4]
@@ -372,7 +372,7 @@ avg2_w4_loop:
pop {pc}
.endfunc
-function x264_pixel_avg2_w8_neon, export=1
+function x264_pixel_avg2_w8_neon
ldr ip, [sp, #4]
push {lr}
ldr lr, [sp, #4]
@@ -390,7 +390,7 @@ avg2_w8_loop:
pop {pc}
.endfunc
-function x264_pixel_avg2_w16_neon, export=1
+function x264_pixel_avg2_w16_neon
ldr ip, [sp, #4]
push {lr}
ldr lr, [sp, #4]
@@ -408,7 +408,7 @@ avg2_w16_loop:
pop {pc}
.endfunc
-function x264_pixel_avg2_w20_neon, export=1
+function x264_pixel_avg2_w20_neon
ldr ip, [sp, #4]
push {lr}
sub r1, r1, #16
@@ -432,8 +432,8 @@ avg2_w20_loop:
.endfunc
-// void mc_copy( uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int height )
-function x264_mc_copy_w4_neon, export=1
+// void mc_copy( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int height )
+function x264_mc_copy_w4_neon
ldr ip, [sp]
copy_w4_loop:
subs ip, ip, #4
@@ -449,7 +449,7 @@ copy_w4_loop:
bx lr
.endfunc
-function x264_mc_copy_w8_neon, export=1
+function x264_mc_copy_w8_neon
ldr ip, [sp]
copy_w8_loop:
subs ip, ip, #4
@@ -465,7 +465,7 @@ copy_w8_loop:
bx lr
.endfunc
-function x264_mc_copy_w16_neon, export=1
+function x264_mc_copy_w16_neon
ldr ip, [sp]
copy_w16_loop:
subs ip, ip, #4
@@ -481,7 +481,7 @@ copy_w16_loop:
bx lr
.endfunc
-function x264_mc_copy_w16_aligned_neon, export=1
+function x264_mc_copy_w16_aligned_neon
ldr ip, [sp]
copy_w16_aligned_loop:
subs ip, ip, #4
@@ -501,7 +501,7 @@ copy_w16_aligned_loop:
// void x264_mc_chroma_neon( uint8_t *dst, int i_dst_stride,
// uint8_t *src, int i_src_stride,
// int dx, int dy, int i_width, int i_height );
-function x264_mc_chroma_neon, export=1
+function x264_mc_chroma_neon
push {r4-r6, lr}
ldrd r4, [sp, #16]
ldr r6, [sp, #24]
@@ -741,7 +741,7 @@ mc_chroma_w8:
// hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width)
-function x264_hpel_filter_v_neon, export=1
+function x264_hpel_filter_v_neon
ldr ip, [sp]
sub r1, r1, r3, lsl #1
push {lr}
@@ -781,7 +781,7 @@ filter_v_loop:
.endfunc
// hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
-function x264_hpel_filter_c_neon, export=1
+function x264_hpel_filter_c_neon
sub r1, #16
vld1.64 {d0-d3}, [r1,:128]!
@@ -866,7 +866,7 @@ filter_c_loop:
.endfunc
// hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
-function x264_hpel_filter_h_neon, export=1
+function x264_hpel_filter_h_neon
sub r1, #16
vmov.u8 d30, #5
vld1.64 {d0-d3}, [r1,:128]!
@@ -956,7 +956,7 @@ filter_h_loop:
// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv,
// uint8_t *dstc, int src_stride, int dst_stride, int width,
// int height )
-function x264_frame_init_lowres_core_neon, export=1
+function x264_frame_init_lowres_core_neon
push {r4-r10,lr}
vpush {d8-d15}
ldrd r4, [sp, #96]
......
@@ -27,7 +27,7 @@ void x264_prefetch_ref_arm( uint8_t *, int, int );
void x264_prefetch_fenc_arm( uint8_t *, int, uint8_t *, int, int );
void *x264_memcpy_aligned_neon( void * dst, const void * src, size_t n );
-void x264_memzero_aligned_neon( void *dst, size_t n );
+void x264_memzero_aligned_neon( void *dst, int n );
void x264_pixel_avg_16x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
void x264_pixel_avg_16x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
......
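The memzero_aligned change brings the header in line with the commit message's prototype fix, presumably so the declaration matches the type x264 uses for this operation elsewhere, with the byte count passed as a plain int. A hypothetical call site (buffer name, size, and alignment attribute are illustrative assumptions):

    #include <stdint.h>

    void x264_memzero_aligned_neon( void *dst, int n );

    /* Hypothetical: dst must be suitably aligned and n a multiple of the
     * NEON loop's per-iteration store width; x264's callers guarantee both. */
    static int16_t dct_buf[16*16] __attribute__((aligned(16)));

    void clear_dct( void )
    {
        x264_memzero_aligned_neon( dct_buf, sizeof(dct_buf) );
    }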
@@ -40,7 +40,7 @@ mask_ac8:
.text
.macro SAD4_ARMV6 h
-function x264_pixel_sad_4x\h\()_armv6, export=1
+function x264_pixel_sad_4x\h\()_armv6
push {r4-r6,lr}
ldr r4, [r2], r3
ldr r5, [r0], r1
@@ -109,7 +109,7 @@ SAD4_ARMV6 8
.endm
.macro SAD_FUNC w, h, name, align:vararg
-function x264_pixel_sad\name\()_\w\()x\h\()_neon, export=1
+function x264_pixel_sad\name\()_\w\()x\h\()_neon
.if \w == 16
.set r, \h / 2 - 1
.else
@@ -199,7 +199,7 @@ SAD_FUNC 16, 16, _aligned, ,:128
.endm
.macro SAD_FUNC_DUAL w, h
-function x264_pixel_sad_aligned_\w\()x\h\()_neon_dual, export=1
+function x264_pixel_sad_aligned_\w\()x\h\()_neon_dual
SAD_DUAL_START_\w
.rept \h / 2 - \w / 8
SAD_DUAL_\w
@@ -321,7 +321,7 @@ SAD_FUNC_DUAL 16, 16
.endm
.macro SAD_X_FUNC x, w, h
-function x264_pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
+function x264_pixel_sad_x\x\()_\w\()x\h\()_neon
push {r6-r7,lr}
.if \x == 3
ldrd r6, [sp, #12]
@@ -463,7 +463,7 @@ SAD_X_FUNC 4, 16, 16
.endm
.macro SSD_FUNC w h
-function x264_pixel_ssd_\w\()x\h\()_neon, export=1
+function x264_pixel_ssd_\w\()x\h\()_neon
SSD_START_\w
.rept \h-2
SSD_\w
@@ -491,7 +491,7 @@ SSD_FUNC 16, 16
\vpadal \qsqr_sum, \qsqr_last
.endm
-function x264_pixel_var_8x8_neon, export=1
+function x264_pixel_var_8x8_neon
vld1.64 {d16}, [r0,:64], r1
vmull.u8 q1, d16, d16
vmovl.u8 q0, d16
@@ -517,7 +517,7 @@ function x264_pixel_var_8x8_neon, export=1
b x264_var_end
.endfunc
-function x264_pixel_var_16x16_neon, export=1
+function x264_pixel_var_16x16_neon
vld1.64 {d16-d17}, [r0,:128], r1
vmull.u8 q12, d16, d16
vmovl.u8 q0, d16
@@ -573,7 +573,7 @@ function x264_var_end
vmlal.s16 \acc, \d1, \d1
.endm
-function x264_pixel_var2_8x8_neon, export=1
+function x264_pixel_var2_8x8_neon
DIFF_SUM q0, d0, d1
DIFF_SUM q8, d16, d17
SQR_ACC q1, d0, d1, vmull.s16
@@ -620,7 +620,7 @@ function x264_pixel_var2_8x8_neon, export=1
vsubl.u8 \q3, d6, d7
.endm
-function x264_pixel_satd_4x4_neon, export=1
+function x264_pixel_satd_4x4_neon
vld1.32 {d1[]}, [r2], r3
vld1.32 {d0[]}, [r0,:32], r1
vld1.32 {d3[]}, [r2], r3
@@ -642,7 +642,7 @@ function x264_pixel_satd_4x4_neon, export=1
bx lr
.endfunc
-function x264_pixel_satd_4x8_neon, export=1
+function x264_pixel_satd_4x8_neon
vld1.32 {d1[]}, [r2], r3
vld1.32 {d0[]}, [r0,:32], r1
vld1.32 {d3[]}, [r2], r3
@@ -669,7 +669,7 @@ function x264_pixel_satd_4x8_neon, export=1
b x264_satd_4x8_8x4_end_neon
.endfunc
-function x264_pixel_satd_8x4_neon, export=1
+function x264_pixel_satd_8x4_neon
vld1.64 {d1}, [r2], r3
vld1.64 {d0}, [r0,:64], r1
vsubl.u8 q0, d0, d1
@@ -713,7 +713,7 @@ function x264_satd_4x8_8x4_end_neon
bx lr
.endfunc
-function x264_pixel_satd_8x8_neon, export=1
+function x264_pixel_satd_8x8_neon
mov ip, lr
bl x264_satd_8x8_neon
@@ -727,7 +727,7 @@ function x264_pixel_satd_8x8_neon, export=1
bx lr
.endfunc
-function x264_pixel_satd_8x16_neon, export=1
+function x264_pixel_satd_8x16_neon
vpush {d8-d11}
mov ip, lr
@@ -798,7 +798,7 @@ function x264_satd_8x4v_8x8h_neon
bx lr
.endfunc
-function x264_pixel_satd_16x8_neon, export=1
+function x264_pixel_satd_16x8_neon
vpush {d8-d11}
mov ip, lr
@@ -820,7 +820,7 @@ function x264_pixel_satd_16x8_neon, export=1
bx lr
.endfunc
-function x264_pixel_satd_16x16_neon, export=1
+function x264_pixel_satd_16x16_neon
vpush {d8-d11}
mov ip, lr
@@ -879,7 +879,7 @@ function x264_satd_16x4_neon
.endfunc
-function x264_pixel_sa8d_8x8_neon, export=1
+function x264_pixel_sa8d_8x8_neon
mov ip, lr
bl x264_sa8d_8x8_neon
vadd.u16 q0, q8, q9
@@ -891,7 +891,7 @@ function x264_pixel_sa8d_8x8_neon, export=1
bx lr
.endfunc
-function x264_pixel_sa8d_16x16_neon, export=1
+function x264_pixel_sa8d_16x16_neon
vpush {d8-d11}
mov ip, lr
bl x264_sa8d_8x8_neon
@@ -988,7 +988,7 @@ function x264_sa8d_8x8_neon
.macro HADAMARD_AC w h
-function x264_pixel_hadamard_ac_\w\()x\h\()_neon, export=1
+function x264_pixel_hadamard_ac_\w\()x\h\()_neon
vpush {d8-d15}
movrel ip, mask_ac4
vmov.i8 q4, #0
@@ -1143,7 +1143,7 @@ function x264_hadamard_ac_8x8_neon
vmull.u8 \ssb, \db, \db
.endm
-function x264_pixel_ssim_4x4x2_core_neon, export=1
+function x264_pixel_ssim_4x4x2_core_neon
ldr ip, [sp]
vld1.64 {d0}, [r0], r1
vld1.64 {d2}, [r2], r3
@@ -1172,7 +1172,7 @@ function x264_pixel_ssim_4x4x2_core_neon, export=1
.endfunc
// FIXME: see about doing 16x16 -> 32 bit multiplies for s1/s2
-function x264_pixel_ssim_end4_neon, export=1
+function x264_pixel_ssim_end4_neon
vld1.32 {d16-d19}, [r0,:128]!
vld1.32 {d20-d23}, [r1,:128]!
vadd.s32 q0, q8, q10
......
@@ -32,7 +32,7 @@ pw_76543210: .short 7,6,5,4,3,2,1,0
.text
// because gcc doesn't believe in using the free shift in add
-function x264_predict_4x4_h_armv6, export=1
+function x264_predict_4x4_h_armv6
ldrb r1, [r0, #0*FDEC_STRIDE-1]
ldrb r2, [r0, #1*FDEC_STRIDE-1]
ldrb r3, [r0, #2*FDEC_STRIDE-1]
@@ -52,7 +52,7 @@ function x264_predict_4x4_h_armv6, export=1
bx lr
.endfunc
-function x264_predict_4x4_dc_armv6, export=1
+function x264_predict_4x4_dc_armv6
mov ip, #0
ldr r1, [r0, #-FDEC_STRIDE]
ldrb r2, [r0, #0*FDEC_STRIDE-1]
@@ -89,7 +89,7 @@ function x264_predict_4x4_dc_armv6, export=1
uadd8 \a2, \a2, \c2
.endm
-function x264_predict_4x4_ddr_armv6, export=1
+function x264_predict_4x4_ddr_armv6
ldr r1, [r0, # -FDEC_STRIDE]
ldrb r2, [r0, # -FDEC_STRIDE-1]
ldrb r3, [r0, #0*FDEC_STRIDE-1]
@@ -118,7 +118,7 @@ function x264_predict_4x4_ddr_armv6, export=1
pop {r4-r6,pc}
.endfunc
-function x264_predict_4x4_ddl_neon, export=1
+function x264_predict_4x4_ddl_neon
sub r0, #FDEC_STRIDE
mov ip, #FDEC_STRIDE
vld1.64 {d0}, [r0], ip
@@ -137,7 +137,7 @@ function x264_predict_4x4_ddl_neon, export=1
bx lr
.endfunc
-function x264_predict_8x8_dc_neon, export=1
+function x264_predict_8x8_dc_neon
mov ip, #0
ldrd r2, [r1, #8]
push {r4-r5,lr}
@@ -162,7 +162,7 @@ function x264_predict_8x8_dc_neon, export=1
.endfunc
-function x264_predict_8x8_h_neon, export=1
+function x264_predict_8x8_h_neon
add r1, r1, #7
mov ip, #FDEC_STRIDE
vld1.64 {d16}, [r1]
@@ -185,7 +185,7 @@ function x264_predict_8x8_h_neon, export=1
bx lr
.endfunc
-function x264_predict_8x8c_h_neon, export=1
+function x264_predict_8x8c_h_neon
sub r1, r0, #1
mov ip, #FDEC_STRIDE
.rept 4
@@ -197,7 +197,7 @@ function x264_predict_8x8c_h_neon, export=1
bx lr
.endfunc
-function x264_predict_8x8c_v_neon, export=1
+function x264_predict_8x8c_v_neon
sub r0, r0, #FDEC_STRIDE
mov ip, #FDEC_STRIDE
vld1.64 {d0}, [r0,:64], ip
@@ -208,7 +208,7 @@ function x264_predict_8x8c_v_neon, export=1
.endfunc
-function x264_predict_16x16_dc_neon, export=1
+function x264_predict_16x16_dc_neon
sub r3, r0, #FDEC_STRIDE
sub r0, r0, #1
vld1.64 {d0-d1}, [r3,:128]
@@ -245,7 +245,7 @@ function x264_predict_16x16_dc_neon, export=1
bx lr
.endfunc
-function x264_predict_16x16_h_neon, export=1
+function x264_predict_16x16_h_neon
sub r1, r0, #1
mov ip, #FDEC_STRIDE
.rept 8
@@ -259,7 +259,7 @@ function x264_predict_16x16_h_neon, export=1