Commit 6c163249 authored by Janne Grunau's avatar Janne Grunau Committed by Anton Mitrofanov

aarch64: NEON asm for missing x264_zigzag_* functions

zigzag_scan_4x4_field_neon, zigzag_sub_4x4_field_neon,
zigzag_sub_4x4ac_field_neon, zigzag_sub_4x4_frame_neon,
zigzag_sub_4x4ac_frame_neon more than 2 times faster

zigzag_scan_8x8_frame_neon, zigzag_scan_8x8_field_neon,
zigzag_sub_8x8_field_neon, zigzag_sub_8x8_frame_neon 4-5 times faster

zigzag_interleave_8x8_cavlc_neon 6 times faster
parent d040d285
/****************************************************************************
* dct-a.S: AArch64 transform and zigzag
* dct-a.S: aarch64 transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2014 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
......@@ -32,6 +33,25 @@ const scan4x4_frame, align=4
.byte 26,27, 28,29, 22,23, 30,31
endconst
// tbl byte indices for x264_zigzag_scan_4x4_field_neon. Each pair of bytes
// selects one 16-bit coefficient from the first 16 bytes of the input, which
// yields the element order 0, 1, 4, 2, 3, 5, 6, 7.
const scan4x4_field, align=4
.byte 0,1, 2,3, 8,9, 4,5
.byte 6,7, 10,11, 12,13, 14,15
endconst
// tbl byte indices used by the zigzag_sub_4x4 macro when instantiated with
// f=frame. The macro gathers four 4-byte pixel rows into one vector, so an
// index is row*4 + column; the table lists the pixels in the order in which
// their differences are written to the level output.
const sub4x4_frame, align=4
.byte 0, 1, 4, 8
.byte 5, 2, 3, 6
.byte 9, 12, 13, 10
.byte 7, 11, 14, 15
endconst
// tbl byte indices used by the zigzag_sub_4x4 macro when instantiated with
// f=field. The macro gathers four 4-byte pixel rows into one vector, so an
// index is row*4 + column; the table lists the pixels in the order in which
// their differences are written to the level output.
const sub4x4_field, align=4
.byte 0, 4, 1, 8
.byte 12, 5, 9, 13
.byte 2, 6, 10, 14
.byte 3, 7, 11, 15
endconst
// sum = a + (b>>shift) sub = (a>>shift) - b
.macro SUMSUB_SHR shift sum sub a b t0 t1
sshr \t0, \b, #\shift
......@@ -655,6 +675,35 @@ function x264_sub8x8_dct_dc_neon, export=1
ret
endfunc
// void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz )
//   x0 = dst, x1 = src (64 16-bit coefficients), x2 = nnz
// De-interleaves src with a stride of four into four runs of 16 coefficients
// (dst[i*16+j] = src[i+j*4]) and stores one flag byte per run to nnz[0],
// nnz[1], nnz[8] and nnz[9]: 1 if the run contains any non-zero coefficient,
// 0 otherwise.
// Clobbers x0-x3, v0-v7, v16-v19, v31.
function x264_zigzag_interleave_8x8_cavlc_neon, export=1
    mov         x3,  #7                                 // step from nnz[1] to nnz[8]
    movi        v31.4s, #1
    ld4         {v0.8h,v1.8h,v2.8h,v3.8h}, [x1],  #64   // v(i)   = src[i+4*j], j = 0..7
    ld4         {v4.8h,v5.8h,v6.8h,v7.8h}, [x1],  #64   // v(4+i) = src[i+4*j], j = 8..15
    // Unsigned max acts as an "any bit set" reduction: the result is zero
    // only if every input coefficient is zero.
    umax        v16.8h, v0.8h,  v4.8h
    umax        v17.8h, v1.8h,  v5.8h
    umax        v18.8h, v2.8h,  v6.8h
    umax        v19.8h, v3.8h,  v7.8h
    st1         {v0.8h}, [x0],  #16
    st1         {v4.8h}, [x0],  #16
    umaxp       v16.8h, v16.8h, v17.8h
    umaxp       v18.8h, v18.8h, v19.8h
    st1         {v1.8h}, [x0],  #16
    st1         {v5.8h}, [x0],  #16
    umaxp       v16.8h, v16.8h, v18.8h
    st1         {v2.8h}, [x0],  #16
    st1         {v6.8h}, [x0],  #16
    // Each 32-bit lane of v16 now packs the two 16-bit partial maxima of one
    // run, so the run is non-zero exactly when the lane is >= 1. This must be
    // cmhs (unsigned >=): cmhi (unsigned >) reports 0 for a lane equal to 1,
    // i.e. when the low half's maximum is 1 and the high half is all zero.
    cmhs        v16.4s, v16.4s, v31.4s
    st1         {v3.8h}, [x0],  #16
    and         v16.16b, v16.16b, v31.16b               // all-ones mask -> 1
    st1         {v7.8h}, [x0],  #16
    st1         {v16.b}[0],    [x2],  #1                // nnz[0]
    st1         {v16.b}[4],    [x2],  x3                // nnz[1]
    st1         {v16.b}[8],    [x2],  #1                // nnz[8]
    st1         {v16.b}[12],   [x2]                     // nnz[9]
    ret
endfunc
function x264_zigzag_scan_4x4_frame_neon, export=1
movrel x2, scan4x4_frame
ld1 {v0.16b,v1.16b}, [x1]
......@@ -664,3 +713,282 @@ function x264_zigzag_scan_4x4_frame_neon, export=1
st1 {v2.16b,v3.16b}, [x0]
ret
endfunc
// Generates x264_zigzag_sub_4x4[ac]_<f>_neon.
//   f  - frame or field; selects the sub4x4_<f> tbl permutation
//   ac - pass "ac" to also emit the code that stores the first difference
//        through x3 and zeroes it in the level output
// Generated function:
//   x0 = level (16 x 16-bit, out), x1 = p_src (stride FENC_STRIDE),
//   x2 = p_dst (stride FDEC_STRIDE), x3 = dc (ac variant only)
//   level receives the permuted differences p_src - p_dst, the four p_src
//   rows are copied over p_dst, and w0 is 1 if any difference stored to
//   level is non-zero, else 0.
.macro zigzag_sub_4x4 f ac
function x264_zigzag_sub_4x4\ac\()_\f\()_neon, export=1
mov x9, #FENC_STRIDE
mov x4, #FDEC_STRIDE
movrel x5, sub4x4_\f
// Keep the p_dst base for the stores: x2 is advanced by the loads below.
mov x6, x2
// v0 = four 4-byte rows of p_src.
ld1 {v0.s}[0], [x1], x9
ld1 {v0.s}[1], [x1], x9
ld1 {v0.s}[2], [x1], x9
ld1 {v0.s}[3], [x1], x9
ld1 {v16.16b}, [x5]
// v1 = four 4-byte rows of p_dst.
ld1 {v1.s}[0], [x2], x4
ld1 {v1.s}[1], [x2], x4
ld1 {v1.s}[2], [x2], x4
ld1 {v1.s}[3], [x2], x4
// Put both pixel sets into scan order before subtracting.
tbl v2.16b, {v0.16b}, v16.16b
tbl v3.16b, {v1.16b}, v16.16b
// The st1 of v0 lanes through x6 (here and below) copies p_src over p_dst.
st1 {v0.s}[0], [x6], x4
usubl v4.8h, v2.8b, v3.8b
.ifc \ac, ac
// ac variant: write the first difference to [x3] and clear it in v4 so it
// neither reaches level nor influences the returned flag.
dup h7, v4.h[0]
ins v4.h[0], wzr
fmov w5, s7
strh w5, [x3]
.endif
usubl2 v5.8h, v2.16b, v3.16b
st1 {v0.s}[1], [x6], x4
// Unsigned max over all 16 differences is zero only if all of them are zero.
umax v6.8h, v4.8h, v5.8h
umaxv h6, v6.8h
st1 {v0.s}[2], [x6], x4
fmov w7, s6
st1 {v0.s}[3], [x6], x4
cmp w7, #0
st1 {v4.8h,v5.8h}, [x0]
cset w0, ne
ret
endfunc
.endm
zigzag_sub_4x4 field
zigzag_sub_4x4 field, ac
zigzag_sub_4x4 frame
zigzag_sub_4x4 frame, ac
// void x264_zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] )
//   x0 = level (out), x1 = dct (in)
// Only the first eight coefficients are reordered (through the scan4x4_field
// byte-index table); the last eight are copied through unchanged.
function x264_zigzag_scan_4x4_field_neon, export=1
    movrel      x3,  scan4x4_field
    ld1         {v31.16b},        [x3]          // permutation for dct[0..7]
    ld1         {v2.8h,v3.8h},    [x1]          // v2 = dct[0..7], v3 = dct[8..15]
    tbl         v2.16b, {v2.16b}, v31.16b
    st1         {v2.8h,v3.8h},    [x0]
    ret
endfunc
// void x264_zigzag_scan_8x8_frame_neon( int16_t level[64], int16_t dct[64] )
//   x0 = level (out), x1 = dct (in)
// Reorders all 64 coefficients with eight tbl lookups driven by the
// scan8x8_frame table. Each lookup only sees a four-register window
// (32 coefficients), so the few lanes whose source lies outside the window
// are patched afterwards with single-lane moves.
function x264_zigzag_scan_8x8_frame_neon, export=1
movrel x2, scan8x8_frame
// v0-v7 = dct[0..63], eight coefficients per register.
ld1 {v0.8h,v1.8h}, [x1], #32
ld1 {v2.8h,v3.8h}, [x1], #32
ld1 {v4.8h,v5.8h}, [x1], #32
ld1 {v6.8h,v7.8h}, [x1]
// v16-v23 = byte-index table, one register per output vector.
ld1 {v16.16b,v17.16b}, [x2], #32
ld1 {v18.16b,v19.16b}, [x2], #32
ld1 {v20.16b,v21.16b}, [x2], #32
ld1 {v22.16b,v23.16b}, [x2], #32
// The first register of each window must match the rebasing applied to the
// corresponding table rows.
tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
tbl v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b
tbl v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b
tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b
tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b
tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b
// Patch the lanes whose table index was outside the tbl window above.
mov v25.h[6], v4.h[0]
mov v25.h[7], v5.h[0]
mov v26.h[0], v4.h[1]
mov v27.h[4], v7.h[0]
mov v28.h[7], v4.h[4]
mov v29.h[7], v3.h[6]
mov v30.h[0], v2.h[7]
mov v30.h[1], v3.h[7]
st1 {v24.8h,v25.8h}, [x0], #32
st1 {v26.8h,v27.8h}, [x0], #32
st1 {v28.8h,v29.8h}, [x0], #32
st1 {v30.8h,v31.8h}, [x0]
ret
endfunc
// Byte-index table for x264_zigzag_scan_8x8_frame_neon, 16 bytes per output
// vector.
//   Z(z)   : the two byte indices of 16-bit element z
//   T(x,y) : Z() of element x*8+y of the dct array, with x rebased (x-3,
//            x-0, x-4) so the index is relative to the first register of the
//            tbl window that consumes those rows
// Entries whose rebased index falls outside the 64-byte window are the lanes
// the function overwrites with explicit moves after the lookups.
#define Z(z) 2*(z), 2*(z)+1
#define T(x,y) Z(x*8+y)
const scan8x8_frame, align=5
.byte T(0,0), T(1,0), T(0,1), T(0,2)
.byte T(1,1), T(2,0), T(3,0), T(2,1)
.byte T(1,2), T(0,3), T(0,4), T(1,3)
.byte T(2,2), T(3,1), T(4,0), T(5,0)
.byte T(4,1), T(3,2), T(2,3), T(1,4)
.byte T(0,5), T(0,6), T(1,5), T(2,4)
#undef T
// Window for the next 16 bytes starts at v3.
#define T(x,y) Z((x-3)*8+y)
.byte T(3,3), T(4,2), T(5,1), T(6,0)
.byte T(7,0), T(6,1), T(5,2), T(4,3)
#undef T
// Window for the next 16 bytes starts at v0.
#define T(x,y) Z((x-0)*8+y)
.byte T(3,4), T(2,5), T(1,6), T(0,7)
.byte T(1,7), T(2,6), T(3,5), T(4,4)
#undef T
// Window for the remaining 48 bytes starts at v4.
#define T(x,y) Z((x-4)*8+y)
.byte T(5,3), T(6,2), T(7,1), T(7,2)
.byte T(6,3), T(5,4), T(4,5), T(3,6)
.byte T(2,7), T(3,7), T(4,6), T(5,5)
.byte T(6,4), T(7,3), T(7,4), T(6,5)
.byte T(5,6), T(4,7), T(5,7), T(6,6)
.byte T(7,5), T(7,6), T(6,7), T(7,7)
endconst
// void x264_zigzag_scan_8x8_field_neon( int16_t level[64], int16_t dct[64] )
//   x0 = level (out), x1 = dct (in)
// Builds the first seven output vectors with tbl lookups driven by the
// scan8x8_field table (each over its own window of input registers) and the
// eighth with two ext instructions.
function x264_zigzag_scan_8x8_field_neon, export=1
movrel x2, scan8x8_field
// v0-v7 = dct[0..63], eight coefficients per register.
ld1 {v0.8h,v1.8h}, [x1], #32
ld1 {v2.8h,v3.8h}, [x1], #32
ld1 {v4.8h,v5.8h}, [x1], #32
ld1 {v6.8h,v7.8h}, [x1]
// v16-v22 = byte-index table; it only has seven 16-byte rows.
ld1 {v16.16b,v17.16b}, [x2], #32
ld1 {v18.16b,v19.16b}, [x2], #32
ld1 {v20.16b,v21.16b}, [x2], #32
ld1 {v22.16b}, [x2]
// v31 = v7 rotated by two coefficients: v7.h[2..7], v7.h[0..1].
ext v31.16b, v7.16b, v7.16b, #4
// The first register of each window must match the rebasing applied to the
// corresponding table rows.
tbl v24.16b, {v0.16b,v1.16b}, v16.16b
tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
tbl v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b
tbl v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b
tbl v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b
tbl v29.16b, {v4.16b,v5.16b,v6.16b}, v21.16b
tbl v30.16b, {v5.16b,v6.16b,v7.16b}, v22.16b
// Last vector = v6.h[6..7] followed by v7.h[2..7].
ext v31.16b, v6.16b, v31.16b, #12
st1 {v24.8h,v25.8h}, [x0], #32
st1 {v26.8h,v27.8h}, [x0], #32
st1 {v28.8h,v29.8h}, [x0], #32
st1 {v30.8h,v31.8h}, [x0]
ret
endfunc
// Generates x264_zigzag_sub_8x8_<f>_neon, where f (frame or field) selects
// the sub8x8_<f> tbl permutation.
// Generated function:
//   x0 = level (64 x 16-bit, out), x1 = p_src (stride FENC_STRIDE),
//   x2 = p_dst (stride FDEC_STRIDE)
//   level receives the permuted differences p_src - p_dst, the eight p_src
//   rows are copied over p_dst, and w0 is 1 if any difference is non-zero,
//   else 0.
.macro zigzag_sub8x8 f
function x264_zigzag_sub_8x8_\f\()_neon, export=1
movrel x4, sub8x8_\f
mov x5, #FENC_STRIDE
mov x6, #FDEC_STRIDE
// Keep the p_dst base for the stores: x2 is advanced by the loads below.
mov x7, x2
// v0-v3 = eight 8-byte rows of p_src, two rows per register.
ld1 {v0.d}[0], [x1], x5
ld1 {v0.d}[1], [x1], x5
ld1 {v1.d}[0], [x1], x5
ld1 {v1.d}[1], [x1], x5
ld1 {v2.d}[0], [x1], x5
ld1 {v2.d}[1], [x1], x5
ld1 {v3.d}[0], [x1], x5
ld1 {v3.d}[1], [x1]
// v4-v7 = eight 8-byte rows of p_dst, two rows per register.
ld1 {v4.d}[0], [x2], x6
ld1 {v4.d}[1], [x2], x6
ld1 {v5.d}[0], [x2], x6
ld1 {v5.d}[1], [x2], x6
ld1 {v6.d}[0], [x2], x6
ld1 {v6.d}[1], [x2], x6
ld1 {v7.d}[0], [x2], x6
ld1 {v7.d}[1], [x2]
// v16-v19 = 64 byte indices (row*8 + column) in scan order.
ld1 {v16.16b,v17.16b}, [x4], #32
ld1 {v18.16b,v19.16b}, [x4], #32
// Put both pixel sets into scan order before subtracting.
tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
tbl v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b
tbl v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b
tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b
tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b
tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b
// Widening subtract: 64 16-bit differences in v4-v7 and v16-v19.
usubl v4.8h, v24.8b, v28.8b
usubl2 v5.8h, v24.16b, v28.16b
usubl v6.8h, v25.8b, v29.8b
usubl2 v7.8h, v25.16b, v29.16b
usubl v16.8h, v26.8b, v30.8b
usubl2 v17.8h, v26.16b, v30.16b
usubl v18.8h, v27.8b, v31.8b
usubl2 v19.8h, v27.16b, v31.16b
// Unsigned max over all differences is zero only if all of them are zero.
umax v20.8h, v4.8h, v5.8h
umax v21.8h, v6.8h, v7.8h
umax v22.8h, v16.8h, v17.8h
umax v23.8h, v18.8h, v19.8h
umax v20.8h, v20.8h, v21.8h
umax v21.8h, v22.8h, v23.8h
umax v20.8h, v20.8h, v21.8h
umaxv h22, v20.8h
// Copy the p_src rows (still in v0-v3) over p_dst.
st1 {v0.d}[0], [x7], x6
st1 {v0.d}[1], [x7], x6
st1 {v1.d}[0], [x7], x6
st1 {v1.d}[1], [x7], x6
st1 {v2.d}[0], [x7], x6
st1 {v2.d}[1], [x7], x6
st1 {v3.d}[0], [x7], x6
st1 {v3.d}[1], [x7]
st1 {v4.8h,v5.8h}, [x0], #32
st1 {v6.8h,v7.8h}, [x0], #32
st1 {v16.8h,v17.8h}, [x0], #32
st1 {v18.8h,v19.8h}, [x0]
fmov w9, s22
cmp w9, #0
cset w0, ne
ret
endfunc
.endm
zigzag_sub8x8 field
zigzag_sub8x8 frame
// Byte-index table for x264_zigzag_scan_8x8_field_neon: seven 16-byte rows,
// one per tbl lookup (the function assembles its eighth output vector with
// ext instead of a lookup).
//   T(x,y) : Z() of element x*8+y of the dct array, with x rebased (x-1 ..
//            x-5) so the index is relative to the first register of the tbl
//            window that consumes that row of the table
#undef T
#define T(x,y) Z(x*8+y)
const scan8x8_field, align=5
.byte T(0,0), T(0,1), T(0,2), T(1,0)
.byte T(1,1), T(0,3), T(0,4), T(1,2)
.byte T(2,0), T(1,3), T(0,5), T(0,6)
.byte T(0,7), T(1,4), T(2,1), T(3,0)
#undef T
// Window starts at v1.
#define T(x,y) Z((x-1)*8+y)
.byte T(2,2), T(1,5), T(1,6), T(1,7)
.byte T(2,3), T(3,1), T(4,0), T(3,2)
#undef T
// Window starts at v2.
#define T(x,y) Z((x-2)*8+y)
.byte T(2,4), T(2,5), T(2,6), T(2,7)
.byte T(3,3), T(4,1), T(5,0), T(4,2)
#undef T
// Window starts at v3.
#define T(x,y) Z((x-3)*8+y)
.byte T(3,4), T(3,5), T(3,6), T(3,7)
.byte T(4,3), T(5,1), T(6,0), T(5,2)
#undef T
// Window starts at v4.
#define T(x,y) Z((x-4)*8+y)
.byte T(4,4), T(4,5), T(4,6), T(4,7)
.byte T(5,3), T(6,1), T(6,2), T(5,4)
#undef T
// Window starts at v5.
#define T(x,y) Z((x-5)*8+y)
.byte T(5,5), T(5,6), T(5,7), T(6,3)
.byte T(7,0), T(7,1), T(6,4), T(6,5)
endconst
// From here on T yields a single byte index: T(y,x) = x*8 + y, i.e. the
// second argument selects the 8-byte pixel row and the first the offset
// inside that row.
#undef T
#define T(y,x) x*8+y
// tbl byte indices used by the zigzag_sub8x8 macro when instantiated with
// f=frame: the 64 pixels of an 8x8 block (eight 8-byte rows held in four
// registers) listed in the order their differences are written to level.
const sub8x8_frame, align=5
.byte T(0,0), T(1,0), T(0,1), T(0,2)
.byte T(1,1), T(2,0), T(3,0), T(2,1)
.byte T(1,2), T(0,3), T(0,4), T(1,3)
.byte T(2,2), T(3,1), T(4,0), T(5,0)
.byte T(4,1), T(3,2), T(2,3), T(1,4)
.byte T(0,5), T(0,6), T(1,5), T(2,4)
.byte T(3,3), T(4,2), T(5,1), T(6,0)
.byte T(7,0), T(6,1), T(5,2), T(4,3)
.byte T(3,4), T(2,5), T(1,6), T(0,7)
.byte T(1,7), T(2,6), T(3,5), T(4,4)
.byte T(5,3), T(6,2), T(7,1), T(7,2)
.byte T(6,3), T(5,4), T(4,5), T(3,6)
.byte T(2,7), T(3,7), T(4,6), T(5,5)
.byte T(6,4), T(7,3), T(7,4), T(6,5)
.byte T(5,6), T(4,7), T(5,7), T(6,6)
.byte T(7,5), T(7,6), T(6,7), T(7,7)
endconst
// tbl byte indices used by the zigzag_sub8x8 macro when instantiated with
// f=field: the 64 pixels of an 8x8 block (eight 8-byte rows held in four
// registers) listed in the order their differences are written to level.
// Uses the single-byte T(y,x) = x*8 + y definition that precedes
// sub8x8_frame.
const sub8x8_field, align=5
.byte T(0,0), T(0,1), T(0,2), T(1,0)
.byte T(1,1), T(0,3), T(0,4), T(1,2)
.byte T(2,0), T(1,3), T(0,5), T(0,6)
.byte T(0,7), T(1,4), T(2,1), T(3,0)
.byte T(2,2), T(1,5), T(1,6), T(1,7)
.byte T(2,3), T(3,1), T(4,0), T(3,2)
.byte T(2,4), T(2,5), T(2,6), T(2,7)
.byte T(3,3), T(4,1), T(5,0), T(4,2)
.byte T(3,4), T(3,5), T(3,6), T(3,7)
.byte T(4,3), T(5,1), T(6,0), T(5,2)
.byte T(4,4), T(4,5), T(4,6), T(4,7)
.byte T(5,3), T(6,1), T(6,2), T(5,4)
.byte T(5,5), T(5,6), T(5,7), T(6,3)
.byte T(7,0), T(7,1), T(6,4), T(6,5)
.byte T(6,6), T(6,7), T(7,2), T(7,3)
.byte T(7,4), T(7,5), T(7,6), T(7,7)
endconst
/*****************************************************************************
* dct.h: AArch64 transform and zigzag
* dct.h: aarch64 transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2014 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
......@@ -48,5 +49,18 @@ void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
/* Zigzag scans: write the coefficients of dct[] to level[] in scan order. */
void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_8x8_frame_neon( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_field_neon( int16_t level[64], int16_t dct[64] );

/* Zigzag subtract: write p_src - p_dst to level[] in scan order, then copy
 * the p_src block over p_dst. Returns 1 if any coefficient stored to level[]
 * is non-zero, 0 otherwise. The ac variants store the first coefficient to
 * *dc and write 0 in its place in level[]. */
int x264_zigzag_sub_4x4_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
int x264_zigzag_sub_4x4ac_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
int x264_zigzag_sub_4x4_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
int x264_zigzag_sub_4x4ac_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
/* The 8x8 variants write 64 coefficients, so level is declared as [64]. */
int x264_zigzag_sub_8x8_field_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );
int x264_zigzag_sub_8x8_frame_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );

/* De-interleave 64 coefficients into four runs of 16
 * (dst[i*16+j] = src[i+j*4]) and store a per-run non-zero flag to
 * nnz[0], nnz[1], nnz[8] and nnz[9]. */
void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz );
#endif
......@@ -1004,7 +1004,20 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
#endif
#if HAVE_ARMV6 || ARCH_AARCH64
if( cpu&X264_CPU_NEON )
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
{
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
#if ARCH_AARCH64
pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_neon;
pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_neon;
pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_neon;
pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_neon;
pf_interlaced->sub_8x8 = x264_zigzag_sub_8x8_field_neon;
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_neon;
pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_neon;
pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon;
pf_progressive->sub_8x8 = x264_zigzag_sub_8x8_frame_neon;
#endif // ARCH_AARCH64
}
#endif // HAVE_ARMV6 || ARCH_AARCH64
#endif // HIGH_BIT_DEPTH
......@@ -1047,4 +1060,13 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
}
#endif // HIGH_BIT_DEPTH
#endif
#if !HIGH_BIT_DEPTH
#if ARCH_AARCH64
if( cpu&X264_CPU_NEON )
{
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon;
}
#endif // ARCH_AARCH64
#endif // !HIGH_BIT_DEPTH
}
......@@ -835,18 +835,18 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
{
pf->deblock_luma[1] = x264_deblock_v_luma_altivec;
pf->deblock_luma[0] = x264_deblock_h_luma_altivec;
}
}
#endif // HAVE_ALTIVEC
#if HAVE_ARMV6 || ARCH_AARCH64
if( cpu&X264_CPU_NEON )
{
if( cpu&X264_CPU_NEON )
{
pf->deblock_luma[1] = x264_deblock_v_luma_neon;
pf->deblock_luma[0] = x264_deblock_h_luma_neon;
pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
pf->deblock_strength = x264_deblock_strength_neon;
}
}
#endif
#endif // !HIGH_BIT_DEPTH
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment