Commit 435722c9 authored by Janne Grunau, committed by Fiona Glaser

arm: x264_coeff_last8_arm

checkasm --bench on a cortex-a9:
coeff_last8_c: 173
coeff_last8_armv6: 151

60 instead of 73 cycles in ~130k runs on the same cpu while encoding.
parent 2e96c571
......@@ -27,36 +27,6 @@
#include "predict.h"
#include "pixel.h"
/* Prototypes for the ARM prediction routines; only declarations are visible
 * here, no bodies. The 4x4, 8x8c and 16x16 variants take just the block
 * pointer, while the 8x8 variants also take a 36-byte edge buffer.
 * NOTE(review): the `_armv6` / `_neon` suffixes presumably name the required
 * CPU feature - confirm against the init functions that install them.
 * NOTE(review): the same prototypes also appear in the X264_ARM_PREDICT_H
 * hunk of this change - confirm that only one copy remains after the commit. */
void x264_predict_4x4_dc_armv6( uint8_t *src );
void x264_predict_4x4_dc_top_neon( uint8_t *src );
void x264_predict_4x4_h_armv6( uint8_t *src );
void x264_predict_4x4_ddr_armv6( uint8_t *src );
void x264_predict_4x4_ddl_neon( uint8_t *src );
void x264_predict_8x8c_dc_neon( uint8_t *src );
void x264_predict_8x8c_dc_top_neon( uint8_t *src );
void x264_predict_8x8c_dc_left_neon( uint8_t *src );
void x264_predict_8x8c_h_neon( uint8_t *src );
void x264_predict_8x8c_v_neon( uint8_t *src );
void x264_predict_8x8c_p_neon( uint8_t *src );
void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_16x16_dc_neon( uint8_t *src );
void x264_predict_16x16_dc_top_neon( uint8_t *src );
void x264_predict_16x16_dc_left_neon( uint8_t *src );
void x264_predict_16x16_h_neon( uint8_t *src );
void x264_predict_16x16_v_neon( uint8_t *src );
void x264_predict_16x16_p_neon( uint8_t *src );
void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
{
if (!(cpu&X264_CPU_ARMV6))
......
......@@ -27,17 +27,35 @@
#define X264_ARM_PREDICT_H
/* Declarations exported by the ARM prediction header: the individual
 * prediction routines followed by the two init entry points that receive a
 * CPU flag word and a table of function pointers. */
void x264_predict_4x4_dc_armv6( uint8_t *src );
void x264_predict_4x4_dc_top_neon( uint8_t *src );
void x264_predict_4x4_v_armv6( uint8_t *src );
void x264_predict_4x4_h_armv6( uint8_t *src );
/* NOTE(review): the nine `pixel`-typed prototypes below are declared again
 * further down with `uint8_t`. This looks like the old and new sides of the
 * diff shown together - confirm which set is in the actual header. If both
 * were really present they would only be compatible when `pixel` is
 * `uint8_t`. */
void x264_predict_8x8_v_neon( pixel *src, pixel edge[36] );
void x264_predict_8x8_h_neon( pixel *src, pixel edge[36] );
void x264_predict_8x8_dc_neon( pixel *src, pixel edge[36] );
void x264_predict_8x8c_dc_neon( pixel *src );
void x264_predict_8x8c_h_neon( pixel *src );
void x264_predict_8x8c_v_neon( pixel *src );
void x264_predict_16x16_v_neon( pixel *src );
void x264_predict_16x16_h_neon( pixel *src );
void x264_predict_16x16_dc_neon( pixel *src );
void x264_predict_4x4_ddr_armv6( uint8_t *src );
void x264_predict_4x4_ddl_neon( uint8_t *src );
/* 8x8 chroma-style (`8x8c`) variants: block pointer only. */
void x264_predict_8x8c_dc_neon( uint8_t *src );
void x264_predict_8x8c_dc_top_neon( uint8_t *src );
void x264_predict_8x8c_dc_left_neon( uint8_t *src );
void x264_predict_8x8c_h_neon( uint8_t *src );
void x264_predict_8x8c_v_neon( uint8_t *src );
void x264_predict_8x8c_p_neon( uint8_t *src );
/* 8x8 variants: block pointer plus a 36-byte edge buffer. */
void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
/* 16x16 variants: block pointer only. */
void x264_predict_16x16_dc_neon( uint8_t *src );
void x264_predict_16x16_dc_top_neon( uint8_t *src );
void x264_predict_16x16_dc_left_neon( uint8_t *src );
void x264_predict_16x16_h_neon( uint8_t *src );
void x264_predict_16x16_v_neon( uint8_t *src );
void x264_predict_16x16_p_neon( uint8_t *src );
/* Init entry points: take the CPU flag word and the function-pointer table
 * (12 entries each); the 8x8 variant additionally takes a pointer to the
 * edge-filter function pointer. */
void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] );
void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
......
......@@ -321,6 +321,20 @@ function x264_coeff_last4_arm
bx lr
.endfunc
@ int x264_coeff_last8_arm( int16_t * )
@ Returns in r0 the index (0..7) of the last nonzero 16-bit coefficient in
@ the 8-element array pointed to by r0, or 0 if every coefficient is zero.
@ Works as a three-step binary search: pick the nonzero half (4 coeffs),
@ then the nonzero pair within it, then the element within the pair.
@ Clobbers r2, r3, ip and the condition flags.
@ NOTE(review): the pair/element selection treats the high half of each
@ word as the higher index, i.e. it assumes little-endian data - confirm.
function x264_coeff_last8_arm
@ r2 = coeffs 4,5   r3 = coeffs 6,7 (byte offset 8 = element 4)
ldrd r2, r3, [r0, #8]
@ Z is set iff coeffs 4..7 are all zero; ip is only a scratch destination
orrs ip, r2, r3
@ upper half nonzero: base index 4 (the pointer in r0 is no longer needed)
movne r0, #4
@ upper half zero: r0 is still the pointer, so load coeffs 0..3 instead
ldrdeq r2, r3, [r0]
@ ...and start from base index 0 (mov/ldrd leave the flags untouched)
moveq r0, #0
@ is the upper pair of the selected half nonzero?
tst r3, r3
@ yes: the last nonzero coeff lies in that pair, two elements further on
addne r0, #2
@ yes: continue with the upper pair; otherwise r2 already holds the lower
movne r2, r3
@ keep only the high halfword of the pair; Z clear iff it is nonzero
lsrs r2, r2, #16
@ high element of the pair is nonzero: it is the last one
addne r0, r0, #1
@ return the index in r0
bx lr
.endfunc
.macro COEFF_LAST_1x size
function x264_coeff_last\size\()_neon
.if \size == 15
......
......@@ -39,6 +39,7 @@ void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
/* coeff_last helpers: each takes a pointer to int16_t coefficients and
 * returns an int. x264_coeff_last8_arm returns the index of the last nonzero
 * coefficient among the first 8 (0 if all are zero).
 * NOTE(review): the other variants presumably do the same for 4, 15, 16 and
 * 64 coefficients - confirm against their implementations. */
int x264_coeff_last4_arm( int16_t * );
int x264_coeff_last8_arm( int16_t * );
int x264_coeff_last15_neon( int16_t * );
int x264_coeff_last16_neon( int16_t * );
int x264_coeff_last64_neon( int16_t * );
......
......@@ -725,7 +725,10 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
#if HAVE_ARMV6
if( cpu&X264_CPU_ARMV6 )
{
pf->coeff_last4 = x264_coeff_last4_arm;
pf->coeff_last8 = x264_coeff_last8_arm;
}
if( cpu&X264_CPU_NEON )
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment