Commit 71ed44c7 authored by Vittorio Giovara, committed by Anton Mitrofanov

Unify 8-bit and 10-bit CLI and libraries

Add 'i_bitdepth' to x264_param_t with the corresponding '--output-depth' CLI
option to set the bit depth at runtime.

Drop the 'x264_bit_depth' global variable. Rather than hardcoding it to an
incorrect value, it's preferable to induce a linking failure. If an application
relies on this symbol, this will make it more obvious where the problem is.

Add Makefile rules that compile modules with different bit depths. Assembly
on x86 is prefixed with the 'private_prefix' define, while all other archs
modify their function prefix internally.

Templatize the main C library, x86/x86_64 assembly, ARM assembly, AARCH64
assembly, PowerPC assembly, and MIPS assembly.

The depth and cache CLI filters heavily depend on bit depth size, so they
need to be duplicated for each value. This means having to rename these
filters, and adjust the callers to use the right version.

Unfortunately the threaded input CLI module inherits a common.h dependency
(input/frame -> common/threadpool -> common/frame -> common/common) which
is extremely complicated to address in a sensible way. Instead duplicate
the module and select the appropriate one at run time.

Each bitdepth needs different checkasm compilation rules, so split the main
checkasm target into two executables.
parent 2451a728
......@@ -13,30 +13,43 @@ GENERATED =
all: default
default:
SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
common/frame.c common/dct.c common/cpu.c common/cabac.c \
common/common.c common/osdep.c common/rectangle.c \
common/set.c common/quant.c common/deblock.c common/vlc.c \
common/mvpred.c common/bitstream.c \
encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
encoder/set.c encoder/macroblock.c encoder/cabac.c \
encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
SRCS = common/osdep.c common/base.c common/cpu.c common/tables.c \
encoder/api.c
SRCS_X = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
common/frame.c common/dct.c common/cabac.c \
common/common.c common/rectangle.c \
common/set.c common/quant.c common/deblock.c common/vlc.c \
common/mvpred.c common/bitstream.c \
encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
encoder/set.c encoder/macroblock.c encoder/cabac.c \
encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
SRCS_8 =
SRCCLI = x264.c input/input.c input/timecode.c input/raw.c input/y4m.c \
output/raw.c output/matroska.c output/matroska_ebml.c \
output/flv.c output/flv_bytestream.c filters/filters.c \
filters/video/video.c filters/video/source.c filters/video/internal.c \
filters/video/resize.c filters/video/cache.c filters/video/fix_vfr_pts.c \
filters/video/select_every.c filters/video/crop.c filters/video/depth.c
filters/video/resize.c filters/video/fix_vfr_pts.c \
filters/video/select_every.c filters/video/crop.c
SRCCLI_X = filters/video/cache.c filters/video/depth.c
SRCSO =
SRCCHK_X = tools/checkasm.c
SRCEXAMPLE = example.c
OBJS =
OBJASM =
OBJSO =
OBJCLI =
OBJCHK = tools/checkasm.o
OBJEXAMPLE = example.o
OBJCHK =
OBJCHK_8 =
OBJCHK_10 =
OBJEXAMPLE =
CONFIG := $(shell cat config.h)
......@@ -51,8 +64,8 @@ SRCCLI += input/avs.c
endif
ifneq ($(findstring HAVE_THREAD 1, $(CONFIG)),)
SRCCLI += input/thread.c
SRCS += common/threadpool.c
SRCS_X += common/threadpool.c
SRCCLI_X += input/thread.c
endif
ifneq ($(findstring HAVE_WIN32THREAD 1, $(CONFIG)),)
......@@ -75,85 +88,118 @@ ifneq ($(findstring HAVE_LSMASH 1, $(CONFIG)),)
SRCCLI += output/mp4_lsmash.c
endif
# MMX/SSE optims
ifneq ($(AS),)
X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm \
cpu-a.asm dct-32.asm bitstream-a.asm
ifneq ($(findstring HIGH_BIT_DEPTH, $(CONFIG)),)
X86SRC0 += sad16-a.asm
else
X86SRC0 += sad-a.asm
endif
X86SRC = $(X86SRC0:%=common/x86/%)
# MMX/SSE optims
SRCASM_X =
ifeq ($(SYS_ARCH),X86)
ARCH_X86 = yes
ASMSRC = $(X86SRC) common/x86/pixel-32.asm
SRCASM_X += common/x86/dct-32.asm \
common/x86/pixel-32.asm
endif
ifeq ($(SYS_ARCH),X86_64)
ARCH_X86 = yes
ASMSRC = $(X86SRC:-32.asm=-64.asm) common/x86/trellis-64.asm
SRCASM_X += common/x86/dct-64.asm \
common/x86/trellis-64.asm
endif
ifdef ARCH_X86
SRCS += common/x86/mc-c.c common/x86/predict-c.c
OBJASM = $(ASMSRC:%.asm=%.o)
$(OBJASM): common/x86/x86inc.asm common/x86/x86util.asm
OBJCHK += tools/checkasm-a.o
SRCASM_X += common/x86/bitstream-a.asm \
common/x86/const-a.asm \
common/x86/cabac-a.asm \
common/x86/dct-a.asm \
common/x86/deblock-a.asm \
common/x86/mc-a.asm \
common/x86/mc-a2.asm \
common/x86/pixel-a.asm \
common/x86/predict-a.asm \
common/x86/quant-a.asm
SRCS_X += common/x86/mc-c.c \
common/x86/predict-c.c
OBJASM += common/x86/cpu-a.o
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.asm=%-8.o) common/x86/sad-a-8.o
endif
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.asm=%-10.o) common/x86/sad16-a-10.o
endif
OBJCHK += tools/checkasm-a.o
endif
# AltiVec optims
ifeq ($(SYS_ARCH),PPC)
ifneq ($(AS),)
SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c \
common/ppc/quant.c common/ppc/deblock.c \
common/ppc/predict.c
endif
SRCS_X += common/ppc/dct.c \
common/ppc/deblock.c \
common/ppc/mc.c \
common/ppc/pixel.c \
common/ppc/predict.c \
common/ppc/quant.c
endif
# NEON optims
ifeq ($(SYS_ARCH),ARM)
ifneq ($(AS),)
ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S \
common/arm/dct-a.S common/arm/quant-a.S common/arm/deblock-a.S \
common/arm/predict-a.S common/arm/bitstream-a.S
SRCS += common/arm/mc-c.c common/arm/predict-c.c
OBJASM = $(ASMSRC:%.S=%.o)
OBJCHK += tools/checkasm-arm.o
SRCASM_X = common/arm/bitstream-a.S \
common/arm/dct-a.S \
common/arm/deblock-a.S \
common/arm/mc-a.S \
common/arm/pixel-a.S \
common/arm/predict-a.S \
common/arm/quant-a.S
SRCS_X += common/arm/mc-c.c \
common/arm/predict-c.c
OBJASM += common/arm/cpu-a.o
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.S=%-8.o)
endif
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.S=%-10.o)
endif
OBJCHK += tools/checkasm-arm.o
endif
# AArch64 NEON optims
ifeq ($(SYS_ARCH),AARCH64)
ifneq ($(AS),)
ASMSRC += common/aarch64/bitstream-a.S \
common/aarch64/cabac-a.S \
common/aarch64/dct-a.S \
common/aarch64/deblock-a.S \
common/aarch64/mc-a.S \
common/aarch64/pixel-a.S \
common/aarch64/predict-a.S \
common/aarch64/quant-a.S
SRCS += common/aarch64/asm-offsets.c \
common/aarch64/mc-c.c \
common/aarch64/predict-c.c
OBJASM = $(ASMSRC:%.S=%.o)
OBJCHK += tools/checkasm-aarch64.o
SRCASM_X = common/aarch64/bitstream-a.S \
common/aarch64/cabac-a.S \
common/aarch64/dct-a.S \
common/aarch64/deblock-a.S \
common/aarch64/mc-a.S \
common/aarch64/pixel-a.S \
common/aarch64/predict-a.S \
common/aarch64/quant-a.S
SRCS_X += common/aarch64/asm-offsets.c \
common/aarch64/mc-c.c \
common/aarch64/predict-c.c
OBJASM +=
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.S=%-8.o)
endif
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.S=%-10.o)
endif
OBJCHK += tools/checkasm-aarch64.o
endif
# MSA optims
ifeq ($(SYS_ARCH),MIPS)
ifneq ($(findstring HAVE_MSA 1, $(CONFIG)),)
SRCS += common/mips/mc-c.c common/mips/dct-c.c \
common/mips/deblock-c.c common/mips/pixel-c.c \
common/mips/predict-c.c common/mips/quant-c.c
SRCS_X += common/mips/dct-c.c \
common/mips/deblock-c.c \
common/mips/mc-c.c \
common/mips/pixel-c.c \
common/mips/predict-c.c \
common/mips/quant-c.c
endif
endif
endif
ifneq ($(HAVE_GETOPT_LONG),1)
SRCCLI += extras/getopt.c
endif
......@@ -170,14 +216,28 @@ ifeq ($(HAVE_OPENCL),yes)
common/oclobj.h: common/opencl/x264-cl.h $(wildcard $(SRCPATH)/common/opencl/*.cl)
cat $^ | $(SRCPATH)/tools/cltostr.sh $@
GENERATED += common/oclobj.h
SRCS += common/opencl.c encoder/slicetype-cl.c
SRCS_8 += common/opencl.c encoder/slicetype-cl.c
endif
OBJS += $(SRCS:%.c=%.o)
OBJCLI += $(SRCCLI:%.c=%.o)
OBJSO += $(SRCSO:%.c=%.o)
OBJEXAMPLE += $(SRCEXAMPLE:%.c=%.o)
.PHONY: all default fprofiled clean distclean install install-* uninstall cli lib-* etags
# Per-bit-depth object lists.
# $(CONFIG) is the text of config.h; each branch below is taken only when that
# text contains the corresponding "HAVE_BITDEPTHn 1" define.
# Every templated source (SRCS_X, SRCCLI_X, SRCCHK_X) is mapped to an object
# with a -8.o / -10.o suffix, so the same .c file can be built once per depth.
# SRCS_8 is added to the 8-bit branch only.
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
OBJS += $(SRCS_X:%.c=%-8.o) $(SRCS_8:%.c=%-8.o)
OBJCLI += $(SRCCLI_X:%.c=%-8.o)
OBJCHK_8 += $(SRCCHK_X:%.c=%-8.o)
# 'checkasm' has no recipe here; it only collects the enabled per-depth binaries.
checkasm: checkasm8$(EXE)
endif
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
OBJS += $(SRCS_X:%.c=%-10.o)
OBJCLI += $(SRCCLI_X:%.c=%-10.o)
OBJCHK_10 += $(SRCCHK_X:%.c=%-10.o)
checkasm: checkasm10$(EXE)
endif
.PHONY: all default fprofiled clean distclean install install-* uninstall cli lib-* checkasm etags
cli: x264$(EXE)
lib-static: $(LIBX264)
......@@ -192,31 +252,60 @@ $(SONAME): $(GENERATED) .depend $(OBJS) $(OBJASM) $(OBJSO)
$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
ifneq ($(EXE),)
.PHONY: x264 checkasm example
.PHONY: x264 checkasm8 checkasm10 example
x264: x264$(EXE)
checkasm: checkasm$(EXE)
checkasm8: checkasm8$(EXE)
checkasm10: checkasm10$(EXE)
example: example$(EXE)
endif
x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
$(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
checkasm8$(EXE): $(GENERATED) .depend $(OBJCHK) $(OBJCHK_8) $(LIBX264)
$(LD)$@ $(OBJCHK) $(OBJCHK_8) $(LIBX264) $(LDFLAGS)
checkasm10$(EXE): $(GENERATED) .depend $(OBJCHK) $(OBJCHK_10) $(LIBX264)
$(LD)$@ $(OBJCHK) $(OBJCHK_10) $(LIBX264) $(LDFLAGS)
example$(EXE): $(GENERATED) .depend $(OBJEXAMPLE) $(LIBX264)
$(LD)$@ $(OBJEXAMPLE) $(LIBX264) $(LDFLAGS)
$(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK) $(OBJEXAMPLE): .depend
$(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK) $(OBJCHK_8) $(OBJCHK_10) $(OBJEXAMPLE): .depend
# Compile / assemble rules.
# Each source kind (.c, .asm, .S) has three pattern rules: a plain %.o rule for
# depth-independent objects, plus %-8.o and %-10.o rules that build the same
# source with the bit depth passed on the command line.

# C, depth-independent.
%.o: %.c
$(CC) $(CFLAGS) -c $< $(CC_O)
# C, 8-bit variant: HIGH_BIT_DEPTH=0, BIT_DEPTH=8.
%-8.o: %.c
$(CC) $(CFLAGS) -c $< $(CC_O) -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8
# C, 10-bit variant: HIGH_BIT_DEPTH=1, BIT_DEPTH=10.
%-10.o: %.c
$(CC) $(CFLAGS) -c $< $(CC_O) -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10
# x86 assembly (.asm), depth-independent. All .asm objects also depend on the
# shared x86inc.asm / x86util.asm includes.
%.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
$(AS) $(ASFLAGS) -o $@ $<
-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
# x86 assembly, 8-bit variant. Besides BIT_DEPTH, a depth-specific
# private_prefix (x264_8) is passed; only HIGH_BIT_DEPTH is not set here.
# The strip step is best-effort ('-' ignores its failure, '@' hides the command).
%-8.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
$(AS) $(ASFLAGS) -o $@ $< -DBIT_DEPTH=8 -Dprivate_prefix=x264_8
-@ $(if $(STRIP), $(STRIP) -x $@)
# x86 assembly, 10-bit variant (private_prefix x264_10).
%-10.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
$(AS) $(ASFLAGS) -o $@ $< -DBIT_DEPTH=10 -Dprivate_prefix=x264_10
-@ $(if $(STRIP), $(STRIP) -x $@)
# Preprocessed assembly (.S), depth-independent.
%.o: %.S
$(AS) $(ASFLAGS) -o $@ $<
-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
# .S, 8-bit variant. No private_prefix is passed for .S files; only the
# HIGH_BIT_DEPTH / BIT_DEPTH defines are given to the assembler.
%-8.o: %.S
$(AS) $(ASFLAGS) -o $@ $< -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8
-@ $(if $(STRIP), $(STRIP) -x $@)
# .S, 10-bit variant.
%-10.o: %.S
$(AS) $(ASFLAGS) -o $@ $< -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10
-@ $(if $(STRIP), $(STRIP) -x $@)
%.dll.o: %.rc x264.h
$(RC) $(RCFLAGS)$@ -DDLL $<
......@@ -227,9 +316,21 @@ $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK) $(OBJEXAMPLE): .depend
@rm -f .depend
@echo 'dependency file generation...'
ifeq ($(COMPILER),CL)
@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(SRCPATH)/tools/msvsdepend.sh "$(CC)" "$(CFLAGS)" "$(SRC)" "$(SRC:$(SRCPATH)/%.c=%.o)" 1>> .depend;)
@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO) $(SRCEXAMPLE)), $(SRCPATH)/tools/msvsdepend.sh "$(CC)" "$(CFLAGS)" "$(SRC)" "$(SRC:$(SRCPATH)/%.c=%.o)" 1>> .depend;)
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS_X) $(SRCS_8) $(SRCCLI_X) $(SRCCHK_X)), $(SRCPATH)/tools/msvsdepend.sh "$(CC)" "$(CFLAGS)" "$(SRC)" "$(SRC:$(SRCPATH)/%.c=%-8.o)" 1>> .depend;)
endif
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS_X) $(SRCCLI_X) $(SRCCHK_X)), $(SRCPATH)/tools/msvsdepend.sh "$(CC)" "$(CFLAGS)" "$(SRC)" "$(SRC:$(SRCPATH)/%.c=%-10.o)" 1>> .depend;)
endif
else
@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO) $(SRCEXAMPLE)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS_X) $(SRCS_8) $(SRCCLI_X) $(SRCCHK_X)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%-8.o) $(DEPMM) 1>> .depend;)
endif
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS_X) $(SRCCLI_X) $(SRCCHK_X)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%-10.o) $(DEPMM) 1>> .depend;)
endif
endif
config.mak:
......@@ -240,7 +341,7 @@ ifneq ($(wildcard .depend),)
include .depend
endif
SRC2 = $(SRCS) $(SRCCLI)
OBJPROF = $(OBJS) $(OBJCLI)
# These should cover most of the important codepaths
OPT0 = --crf 30 -b1 -m1 -r1 --me dia --no-cabac --direct temporal --ssim --no-weightb
OPT1 = --crf 16 -b2 -m3 -r3 --me hex --no-8x8dct --direct spatial --no-dct-decimate -t0 --slice-max-mbs 50
......@@ -265,17 +366,18 @@ ifeq ($(COMPILER),CL)
# Because Visual Studio timestamps the object files within the PGD, it fails to build if they change - only the executable should be deleted
rm -f x264$(EXE)
else
rm -f $(SRC2:%.c=%.o)
rm -f $(OBJPROF)
endif
$(MAKE) CFLAGS="$(CFLAGS) $(PROF_USE_CC)" LDFLAGS="$(LDFLAGS) $(PROF_USE_LD)"
rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
rm -f $(OBJPROF:%.o=%.gcda) $(OBJPROF:%.o=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
endif
clean:
rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
rm -f example example.exe $(OBJEXAMPLE)
rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(GENERATED) .depend TAGS
rm -f $(SONAME) *.a *.lib *.exp *.pdb x264$(EXE) x264_lookahead.clbin
rm -f checkasm8$(EXE) checkasm10$(EXE) $(OBJCHK) $(OBJCHK_8) $(OBJCHK_10)
rm -f example$(EXE) $(OBJEXAMPLE)
rm -f $(OBJPROF:%.o=%.gcda) $(OBJPROF:%.o=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
distclean: clean
rm -f config.mak x264_config.h config.h config.log x264.pc x264.def
......@@ -319,4 +421,4 @@ endif
etags: TAGS
TAGS:
etags $(SRCS)
etags $(SRCS) $(SRCS_X) $(SRCS_8)
......@@ -27,12 +27,27 @@
#include "config.h"
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
#ifdef PREFIX
# define EXTERN_ASM _x264_
# define BASE _x264_
# define SYM_PREFIX _
#else
# define BASE x264_
# define SYM_PREFIX
#endif
#ifdef BIT_DEPTH
# define EXTERN_ASM JOIN(JOIN(BASE, BIT_DEPTH), _)
#else
# define EXTERN_ASM x264_
# define EXTERN_ASM BASE
#endif
#define X(s) JOIN(EXTERN_ASM, s)
#define X264(s) JOIN(BASE, s)
#define EXT(s) JOIN(SYM_PREFIX, s)
#ifdef __ELF__
# define ELF
#else
......@@ -98,10 +113,6 @@ MACH .const_data
#endif
.endm
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
#define X(s) JOIN(EXTERN_ASM, s)
#define FDEC_STRIDE 32
#define FENC_STRIDE 16
......
/*****************************************************************************
* bitstream.h: aarch64 bitstream functions
*****************************************************************************
* Copyright (C) 2017 x264 project
*
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_AARCH64_BITSTREAM_H
#define X264_AARCH64_BITSTREAM_H
/* Alias the NEON entry point through x264_template() before declaring it, so
 * the prototype below is declared under whatever name the macro produces.
 * NOTE(review): x264_template is not defined in this header; presumably it
 * yields a bit-depth-specific symbol name -- confirm against its definition. */
#define x264_nal_escape_neon x264_template(nal_escape_neon)
/* Takes a destination pointer plus a source range [src, end) and returns a
 * uint8_t pointer.
 * NOTE(review): presumably the returned pointer is the end of the written
 * output -- confirm against the assembly implementation. */
uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end );
#endif
......@@ -30,8 +30,8 @@
// w12 holds x264_cabac_t.i_range
function cabac_encode_decision_asm, export=1
movrel x8, X(cabac_range_lps)
movrel x9, X(cabac_transition)
movrel x8, X264(cabac_range_lps)
movrel x9, X264(cabac_transition)
add w10, w1, #CABAC_STATE
ldrb w3, [x0, x10] // i_state
ldr w12, [x0, #CABAC_I_RANGE]
......
......@@ -27,41 +27,68 @@
#ifndef X264_AARCH64_DCT_H
#define X264_AARCH64_DCT_H
/* AArch64 NEON DCT / zigzag prototypes.
 *
 * Every function is first aliased through x264_template() and then declared,
 * so the prototype is emitted under whatever name the macro produces.
 * NOTE(review): x264_template is defined outside this header; presumably it
 * yields a bit-depth-specific symbol name -- confirm against its definition.
 *
 * Array bounds in the parameter lists are documentation only (array
 * parameters are adjusted to pointers); they state the expected element count. */

/* 4x4 DC transforms (in place on 16 coefficients). */
#define x264_dct4x4dc_neon x264_template(dct4x4dc_neon)
void x264_dct4x4dc_neon( int16_t d[16] );
#define x264_idct4x4dc_neon x264_template(idct4x4dc_neon)
void x264_idct4x4dc_neon( int16_t d[16] );

/* Forward 4x4-based transforms of the difference pix1 - pix2. */
#define x264_sub4x4_dct_neon x264_template(sub4x4_dct_neon)
void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x8_dct_neon x264_template(sub8x8_dct_neon)
void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct_neon x264_template(sub16x16_dct_neon)
void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );

/* Inverse 4x4-based transforms, added onto p_dst. */
#define x264_add4x4_idct_neon x264_template(add4x4_idct_neon)
void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
#define x264_add8x8_idct_neon x264_template(add8x8_idct_neon)
void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
#define x264_add16x16_idct_neon x264_template(add16x16_idct_neon)
void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );

/* DC-only variants. */
#define x264_add8x8_idct_dc_neon x264_template(add8x8_idct_dc_neon)
void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
#define x264_add16x16_idct_dc_neon x264_template(add16x16_idct_dc_neon)
void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
#define x264_sub8x8_dct_dc_neon x264_template(sub8x8_dct_dc_neon)
void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x16_dct_dc_neon x264_template(sub8x16_dct_dc_neon)
void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 );

/* 8x8 transforms (64 coefficients per block). */
#define x264_sub8x8_dct8_neon x264_template(sub8x8_dct8_neon)
void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct8_neon x264_template(sub16x16_dct8_neon)
void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
#define x264_add8x8_idct8_neon x264_template(add8x8_idct8_neon)
void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
#define x264_add16x16_idct8_neon x264_template(add16x16_idct8_neon)
void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );

/* Zigzag scans of an existing coefficient block, frame and field order. */
#define x264_zigzag_scan_4x4_frame_neon x264_template(zigzag_scan_4x4_frame_neon)
void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
#define x264_zigzag_scan_4x4_field_neon x264_template(zigzag_scan_4x4_field_neon)
void x264_zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] );
#define x264_zigzag_scan_8x8_frame_neon x264_template(zigzag_scan_8x8_frame_neon)
void x264_zigzag_scan_8x8_frame_neon( int16_t level[64], int16_t dct[64] );
#define x264_zigzag_scan_8x8_field_neon x264_template(zigzag_scan_8x8_field_neon)
void x264_zigzag_scan_8x8_field_neon( int16_t level[64], int16_t dct[64] );

/* Zigzag "sub" variants taking source and destination pixels; the "ac"
 * forms additionally take a separate dc output pointer. All return int. */
#define x264_zigzag_sub_4x4_field_neon x264_template(zigzag_sub_4x4_field_neon)
int x264_zigzag_sub_4x4_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
#define x264_zigzag_sub_4x4ac_field_neon x264_template(zigzag_sub_4x4ac_field_neon)
int x264_zigzag_sub_4x4ac_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
#define x264_zigzag_sub_4x4_frame_neon x264_template(zigzag_sub_4x4_frame_neon)
int x264_zigzag_sub_4x4_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
#define x264_zigzag_sub_4x4ac_frame_neon x264_template(zigzag_sub_4x4ac_frame_neon)
int x264_zigzag_sub_4x4ac_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
/* The 8x8 forms are declared with level[64], matching the 8x8 scan
 * prototypes above; the previous [16] bound was misleading for an 8x8 block.
 * The function type is unchanged, since the bound is not part of it. */
#define x264_zigzag_sub_8x8_field_neon x264_template(zigzag_sub_8x8_field_neon)
int x264_zigzag_sub_8x8_field_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );
#define x264_zigzag_sub_8x8_frame_neon x264_template(zigzag_sub_8x8_frame_neon)
int x264_zigzag_sub_8x8_frame_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );

#define x264_zigzag_interleave_8x8_cavlc_neon x264_template(zigzag_interleave_8x8_cavlc_neon)
void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz );
#endif
/*****************************************************************************
* deblock.h: aarch64 deblocking
*****************************************************************************
* Copyright (C) 2017 x264 project
*
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_AARCH64_DEBLOCK_H
#define X264_AARCH64_DEBLOCK_H
/* AArch64 NEON deblocking prototypes.
 * Every function is first aliased through x264_template() and then declared,
 * so the prototype is emitted under whatever name the macro produces.
 * NOTE(review): x264_template is defined outside this header; presumably it
 * yields a bit-depth-specific symbol name -- confirm against its definition. */

/* Luma / chroma filters taking alpha, beta and a tc0 table. */
#define x264_deblock_v_luma_neon x264_template(deblock_v_luma_neon)
void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_luma_neon x264_template(deblock_h_luma_neon)
void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_v_chroma_neon x264_template(deblock_v_chroma_neon)
void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_neon x264_template(deblock_h_chroma_neon)
void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
/* Takes nnz / ref / mv arrays plus mvy_limit and bframe, with bs as the only
 * other array argument.
 * NOTE(review): presumably bs receives the computed strengths -- confirm. */
#define x264_deblock_strength_neon x264_template(deblock_strength_neon)
void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
#define x264_deblock_h_chroma_422_neon x264_template(deblock_h_chroma_422_neon)
void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_mbaff_neon x264_template(deblock_h_chroma_mbaff_neon)
void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
/* The *_intra_* variants below take no tc0 argument. */
#define x264_deblock_h_chroma_intra_mbaff_neon x264_template(deblock_h_chroma_intra_mbaff_neon)
void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_intra_neon x264_template(deblock_h_chroma_intra_neon)
void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_422_intra_neon x264_template(deblock_h_chroma_422_intra_neon)
void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_v_chroma_intra_neon x264_template(deblock_v_chroma_intra_neon)
void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_luma_intra_neon x264_template(deblock_h_luma_intra_neon)
void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon)
void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#endif
......@@ -27,47 +27,89 @@
#include "common/common.h"
#include "mc.h"
#define x264_prefetch_ref_aarch64 x264_template(prefetch_ref_aarch64)
void x264_prefetch_ref_aarch64( uint8_t *, intptr_t, int );
#define x264_prefetch_fenc_420_aarch64 x264_template(prefetch_fenc_420_aarch64)
void x264_prefetch_fenc_420_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_prefetch_fenc_422_aarch64 x264_template(prefetch_fenc_422_aarch64)
void x264_prefetch_fenc_422_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon)
void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
#define x264_memzero_aligned_neon x264_template(memzero_aligned_neon)
void x264_memzero_aligned_neon( void *dst, size_t n );
#define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
void x264_pixel_avg_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
void x264_pixel_avg_8x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
void x264_pixel_avg_4x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
void x264_pixel_avg_4x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
void x264_pixel_avg_4x2_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_plane_copy_core_neon x264_template(plane_copy_core_neon)
void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_swap_core_neon x264_template(plane_copy_swap_core_neon)
void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_neon x264_template(plane_copy_deinterleave_neon)
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_rgb_neon x264_template(plane_copy_deinterleave_rgb_neon)
void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc,
pixel *src, intptr_t i_src, int pw, int w, int h );
#define x264_plane_copy_interleave_core_neon x264_template(plane_copy_interleave_core_neon)
void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
#define x264_store_interleave_chroma_neon x264_template(store_interleave_chroma_neon)
void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
#define x264_load_deinterleave_chroma_fdec_neon x264_template(load_deinterleave_chroma_fdec_neon)
void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fenc_neon x264_template(load_deinterleave_chroma_fenc_neon)
void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
/* Aliases for the mc_weight NEON functions.
 * Their prototypes are not written out individually: the MC_WEIGHT(func)
 * macro that follows builds the names by token pasting
 * (x264_mc_weight_w20##func##_neon, x264_mc_weight_w16##func##_neon, ...).
 * Each pasted identifier is rescanned for macros, so every width (w4, w8,
 * w16, w20) and every variant (plain, _nodenom, _offsetadd, _offsetsub)
 * needs its own alias listed here. */
#define x264_mc_weight_w16_neon x264_template(mc_weight_w16_neon)
#define x264_mc_weight_w16_nodenom_neon x264_template(mc_weight_w16_nodenom_neon)
#define x264_mc_weight_w16_offsetadd_neon x264_template(mc_weight_w16_offsetadd_neon)
#define x264_mc_weight_w16_offsetsub_neon x264_template(mc_weight_w16_offsetsub_neon)
#define x264_mc_weight_w20_neon x264_template(mc_weight_w20_neon)
#define x264_mc_weight_w20_nodenom_neon x264_template(mc_weight_w20_nodenom_neon)
#define x264_mc_weight_w20_offsetadd_neon x264_template(mc_weight_w20_offsetadd_neon)
#define x264_mc_weight_w20_offsetsub_neon x264_template(mc_weight_w20_offsetsub_neon)
#define x264_mc_weight_w4_neon x264_template(mc_weight_w4_neon)
#define x264_mc_weight_w4_nodenom_neon x264_template(mc_weight_w4_nodenom_neon)
#define x264_mc_weight_w4_offsetadd_neon x264_template(mc_weight_w4_offsetadd_neon)
#define x264_mc_weight_w4_offsetsub_neon x264_template(mc_weight_w4_offsetsub_neon)
#define x264_mc_weight_w8_neon x264_template(mc_weight_w8_neon)
#define x264_mc_weight_w8_nodenom_neon x264_template(mc_weight_w8_nodenom_neon)
#define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon)
#define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon)
#define MC_WEIGHT(func)\
void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
......@@ -84,25 +126,39 @@ static void (* mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_
x264_mc_weight_w20##func##_neon,\
};