Commit 54ad561d authored by Martin Storsjö
Browse files

arm64: filmgrain: Share the prologue of the fgy function

This is the same as what was done for the fguv function, to reduce
the amount of space used for it (and also to simplify the calling
code).

This gives no significant slowdown for the case currently benchmarked
by checkasm, while shrinking the code produced by film_grain.S by
320 bytes.
parent 90bcb331
Pipeline #88012 passed with stages
in 6 minutes and 43 seconds
......@@ -96,25 +96,19 @@ endconst
add \dst, \dst, \offx, uxtw // grain_lut += offx
.endm
// void dav1d_fgy_32x32_XY_8bpc_neon(pixel *const dst, const pixel *const src,
// const ptrdiff_t stride,
// const uint8_t scaling[SCALING_SIZE],
// const int scaling_shift,
// const entry grain_lut[][GRAIN_WIDTH],
// const int offsets[][2],
// const int h, const ptrdiff_t clip);
.macro fgy ox, oy
function fgy_32x32_\ox\oy\()_8bpc_neon, export=1
// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
// const ptrdiff_t stride,
// const uint8_t scaling[SCALING_SIZE],
// const int scaling_shift,
// const entry grain_lut[][GRAIN_WIDTH],
// const int offsets[][2],
// const int h, const ptrdiff_t clip,
// const ptrdiff_t type);
function fgy_32x32_8bpc_neon, export=1
str x30, [sp, #-16]!
.if \ox
ldr w11, [x6, #8] // offsets[1][0]
.endif
.if \oy
ldr w13, [x6, #4] // offsets[0][1]
.endif
.if \ox && \oy
ldr w15, [x6, #12] // offsets[1][1]
.endif
ldr w6, [x6] // offsets[0][0]
ldr w8, [sp, #16] // clip
mov x9, #GRAIN_WIDTH // grain_lut stride
......@@ -122,9 +116,7 @@ function fgy_32x32_\ox\oy\()_8bpc_neon, export=1
neg w4, w4
dup v29.8h, w4 // -scaling_shift
.if \ox || \oy
movrel x16, overlap_coeffs_0
.endif
cbz w8, 1f
// clip
......@@ -137,54 +129,48 @@ function fgy_32x32_\ox\oy\()_8bpc_neon, export=1
movi v31.16b, #255
2:
.if \ox || \oy
ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs
.endif
add x5, x5, #9 // grain_lut += 9
add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride
add x5, x5, x9 // grain_lut += grain_stride
.if \ox
calc_offset w11, w12, w11, 0, 0
.endif
.if \oy
calc_offset w13, w14, w13, 0, 0
.endif
.if \ox && \oy
calc_offset w15, w16, w15, 0, 0
.endif
calc_offset w6, w10, w6, 0, 0
.if \ox
add_offset x12, w11, x12, x5, x9
.endif
.if \oy
add_offset x14, w13, x14, x5, x9
.endif
.if \ox && \oy
add_offset x16, w15, x16, x5, x9
.endif
add_offset x5, w6, x10, x5, x9
.if \ox
ldr w11, [sp, #24] // type
adr x13, L(fgy_loop_tbl)
add x4, x12, #32 // grain_lut += BLOCK_SIZE * bx
.endif
.if \oy
add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
.endif
.if \ox && \oy
tst w11, #1
ldrh w11, [x13, w11, uxtw #1]
add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add x8, x8, #32 // grain_lut += BLOCK_SIZE * bx
.endif
.if \oy
sub x11, x13, w11, uxtw
b.eq 1f
// y overlap
dup v6.16b, v27.b[0]
dup v7.16b, v27.b[1]
mov w10, w7 // backup actual h
mov w7, #2
.endif
1:
br x11
endfunc
function fgy_loop_neon
.macro fgy ox, oy
L(loop_\ox\oy):
1:
ld1 {v0.16b, v1.16b}, [x1], x2 // src
......@@ -300,13 +286,19 @@ L(loop_\ox\oy):
.endif
ldr x30, [sp], #16
ret
endfunc
.endm
fgy 0, 0
fgy 0, 1
fgy 1, 0
fgy 1, 1
fgy 0, 0
fgy 0, 1
fgy 1, 0
fgy 1, 1
L(fgy_loop_tbl):
.hword L(fgy_loop_tbl) - L(loop_00)
.hword L(fgy_loop_tbl) - L(loop_01)
.hword L(fgy_loop_tbl) - L(loop_10)
.hword L(fgy_loop_tbl) - L(loop_11)
endfunc
// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
// const pixel *const src,
......
......@@ -41,21 +41,16 @@ CHECK_OFFSET(Dav1dFilmGrainData, clip_to_restricted_range, FGD_CLIP_TO_RESTRICTE
// Use ptrdiff_t instead of int for the last few parameters, to get the
// same layout of parameters on the stack across platforms.
#define FGY(suff) \
void BF(dav1d_fgy_32x32_ ## suff, neon)(pixel *const dst, \
const pixel *const src, \
const ptrdiff_t stride, \
const uint8_t scaling[SCALING_SIZE], \
const int scaling_shift, \
const entry grain_lut[][GRAIN_WIDTH], \
const int offsets[][2], \
const int h, const ptrdiff_t clip \
HIGHBD_DECL_SUFFIX)
FGY(00);
FGY(01);
FGY(10);
FGY(11);
// NEON fgy film grain entry point for a 32x32 block. The overlap variant
// is selected at run time through `type` instead of through separately
// named functions; the caller (fgy_32x32xn_neon) builds it as a bit mask:
//   type & 1 - overlap in y (overlap enabled and not the first row)
//   type & 2 - overlap in x (overlap enabled and bx != 0)
// `clip` receives data->clip_to_restricted_range and `h` the block height.
// `clip` and `type` are declared as ptrdiff_t rather than int so that the
// stack-passed parameters have the same layout across platforms.
void BF(dav1d_fgy_32x32, neon)(pixel *const dst,
const pixel *const src,
const ptrdiff_t stride,
const uint8_t scaling[SCALING_SIZE],
const int scaling_shift,
const entry grain_lut[][GRAIN_WIDTH],
const int offsets[][2],
const int h, const ptrdiff_t clip,
const ptrdiff_t type
HIGHBD_DECL_SUFFIX);
// Use ptrdiff_t instead of int for the last few parameters, to get the
// parameters on the stack with the same layout across platforms.
......@@ -117,33 +112,17 @@ static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row,
for (int i = 0; i < rows; i++)
offsets[0][i] = get_random_number(8, &seed[i]);
if (data->overlap_flag && row_num) {
if (data->overlap_flag && bx)
BF(dav1d_fgy_32x32_11, neon)(dst_row + bx, src_row + bx, stride,
scaling, data->scaling_shift,
grain_lut, offsets, bh,
data->clip_to_restricted_range
HIGHBD_TAIL_SUFFIX);
else
BF(dav1d_fgy_32x32_01, neon)(dst_row + bx, src_row + bx, stride,
scaling, data->scaling_shift,
grain_lut, offsets, bh,
data->clip_to_restricted_range
HIGHBD_TAIL_SUFFIX);
} else {
if (data->overlap_flag && bx)
BF(dav1d_fgy_32x32_10, neon)(dst_row + bx, src_row + bx, stride,
scaling, data->scaling_shift,
grain_lut, offsets, bh,
data->clip_to_restricted_range
HIGHBD_TAIL_SUFFIX);
else
BF(dav1d_fgy_32x32_00, neon)(dst_row + bx, src_row + bx, stride,
scaling, data->scaling_shift,
grain_lut, offsets, bh,
data->clip_to_restricted_range
HIGHBD_TAIL_SUFFIX);
}
int type = 0;
if (data->overlap_flag && row_num)
type |= 1; /* overlap y */
if (data->overlap_flag && bx)
type |= 2; /* overlap x */
BF(dav1d_fgy_32x32, neon)(dst_row + bx, src_row + bx, stride,
scaling, data->scaling_shift,
grain_lut, offsets, bh,
data->clip_to_restricted_range, type
HIGHBD_TAIL_SUFFIX);
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment