Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • videolan/x264
  • EwoutH/x264
  • gramner/x264
  • BugMaster/x264
  • MaskRay/x264
  • thresh/x264
  • tpm/x264
  • wolfired/x264
  • ifb/x264
  • robinstorm/x264
  • ltnokiago/x264
  • janne/x264
  • Kromjunya/x264
  • trisnaayu0596/x264
  • felipegarcia1402/x264
  • coder2004/x264
  • philou/x264
  • walagnatalia/x264
  • DonDiego/x264
  • JHammler/x264
  • qyot27/x264
  • dwbuiten/x264
  • Kagami/x264
  • andriy-andreyev/x264
  • gxw/x264
  • trofi/x264
  • kierank/x264
  • aureliendavid/x264
  • galad/x264
  • roommini/x264
  • ocrete/x264
  • mstorsjo/x264
  • yinsj0116/x264
  • mamonet/x264
  • 1div0/x264
  • ko1265/x264
  • sergiomb2/x264
  • xutongda/x264
  • wenzhiwu/x264
  • arrowd/x264
  • FranceBB/x264
  • ziemek99/x264
  • longervision/x264
  • xopok/x264
  • jbk/x264
  • szatmary/x264
  • pekdon/x264
  • Jiangguyu/x264
  • jrtc27/x264
  • kankanol1/x264
  • gxwLite/x264
  • brad/x264
  • Gc6026/x264
  • jdek/x264
  • appcrash/x264
  • tguillem/x264
  • As/x264
  • wevian/x264
  • wangluls/x264
  • RellikJaeger/x264
  • hum/x264
  • rogerhardiman/x264
  • jankowalski12611/x264
  • zhijie1996/x264
  • yinshiyou/x264
  • Freed-Wu/x264
  • yajcoca/x264
  • bUd/x264
  • chienvannguyen2020/x264
  • nurbinakhatun386/x264
  • Siberiawind/x-264-meson
  • HecaiYuan/x264
  • david.chen/x264
  • Ytsejam76/x264
  • robUx4/x264
  • zhaoshiz/x-264-arm64ec
  • yintong.ustc/x-264-bd-ventana
  • nekobasu/x264
  • Courmisch/x264
  • BD-qjy/x264
  • quink/x264
  • markos/x264
82 results
Show changes
Commits on Source (9)
  • Yin Shiyou's avatar
    loongarch: Init LSX/LASX support · 1ecc51ee
    Yin Shiyou authored and Yin Shiyou's avatar Yin Shiyou committed
    
    LSX/LASX are the LoongArch 128-bit/256-bit SIMD architecture extensions.
    
    Signed-off-by: default avatarShiyou Yin <yinshiyou-hf@loongson.cn>
    Signed-off-by: default avatarXiwei Gu <guxiwei-hf@loongson.cn>
    1ecc51ee
  • Yin Shiyou's avatar
    loongarch: Add loongson_asm.S and loongson_utils.S · 25ffd616
    Yin Shiyou authored and Yin Shiyou's avatar Yin Shiyou committed
    
    Common macros and functions for loongson optimization.
    
    Signed-off-by: default avatarShiyou Yin <yinshiyou-hf@loongson.cn>
    25ffd616
  • Yin Shiyou's avatar
    loongarch: Improve the performance of deblock series functions. · d7d283f6
    Yin Shiyou authored and Yin Shiyou's avatar Yin Shiyou committed
    
    Performance has improved from 4.76fps to 4.92fps.
    Tested with following command:
    ./configure && make -j5
    ./x264 --threads 4 -o out.mkv yuv_1920x1080.yuv
    
    functions           performance     performance
                            (c)            (asm)
    deblock_luma[0]         79               39
    deblock_luma[1]         91               18
    deblock_luma_intra[0]   63               44
    deblock_luma_intra[1]   71               18
    deblock_strength        104              33
    
    Signed-off-by: default avatarHao Chen <chenhao@loongson.cn>
    d7d283f6
  • Yin Shiyou's avatar
    loongarch: Improve the performance of sad/sad_x3/sad_x4 series functions · 00b8e3b9
    Yin Shiyou authored and Yin Shiyou's avatar Yin Shiyou committed
    
    Performance has improved from 4.92fps to 6.32fps.
    Tested with following command:
    ./configure && make -j5
    ./x264 --threads 4 -o out.mkv yuv_1920x1080.yuv
    
    functions           performance     performance
                            (c)            (asm)
    sad_4x4                 13               3
    sad_4x8                 26               7
    sad_4x16                57               13
    sad_8x4                 24               3
    sad_8x8                 54               8
    sad_8x16                108              13
    sad_16x8                95               8
    sad_16x16               189              13
    sad_x3_4x4              37               6
    sad_x3_4x8              71               13
    sad_x3_8x4              70               8
    sad_x3_8x8              162              14
    sad_x3_8x16             323              25
    sad_x3_16x8             279              15
    sad_x3_16x16            555              27
    sad_x4_4x4              49               8
    sad_x4_4x8              95               17
    sad_x4_8x4              94               8
    sad_x4_8x8              214              16
    sad_x4_8x16             429              33
    sad_x4_16x8             372              18
    sad_x4_16x16            740              34
    
    Signed-off-by: default avatarwanglu <wanglu@loongson.cn>
    00b8e3b9
  • Yin Shiyou's avatar
    loongarch: Improve the performance of predict series functions · d8ed272a
    Yin Shiyou authored and Yin Shiyou's avatar Yin Shiyou committed
    
    Performance has improved from 6.32fps to 6.34fps.
    Tested with following command:
    ./configure && make -j5
    ./x264 --threads 4 -o out.mkv yuv_1920x1080.yuv
    
    functions           performance     performance
                            (c)            (asm)
    intra_predict_4x4_dc     3               2
    intra_predict_4x4_dc8    1               1
    intra_predict_4x4_dcl    2               1
    intra_predict_4x4_dct    2               1
    intra_predict_4x4_ddl    7               2
    intra_predict_4x4_h      2               1
    intra_predict_4x4_v      1               1
    intra_predict_8x8_dc     8               2
    intra_predict_8x8_dc8    1               1
    intra_predict_8x8_dcl    5               2
    intra_predict_8x8_dct    5               2
    intra_predict_8x8_ddl    27              3
    intra_predict_8x8_ddr    26              3
    intra_predict_8x8_h      4               2
    intra_predict_8x8_v      3               1
    intra_predict_8x8_vl     29              3
    intra_predict_8x8_vr     31              4
    intra_predict_8x8c_dc    8               5
    intra_predict_8x8c_dc8   1               1
    intra_predict_8x8c_dcl   5               3
    intra_predict_8x8c_dct   5               3
    intra_predict_8x8c_h     4               2
    intra_predict_8x8c_p     58              30
    intra_predict_8x8c_v     4               1
    intra_predict_16x16_dc   32              8
    intra_predict_16x16_dc8  9               4
    intra_predict_16x16_dcl  26              6
    intra_predict_16x16_dct  26              6
    intra_predict_16x16_h    23              7
    intra_predict_16x16_p    182             44
    intra_predict_16x16_v    22              4
    
    Signed-off-by: default avatarXiwei Gu <guxiwei-hf@loongson.cn>
    d8ed272a
  • Yin Shiyou's avatar
    loongarch: Improve the performance of quant series functions · 65e7bac5
    Yin Shiyou authored and Yin Shiyou's avatar Yin Shiyou committed
    
    Performance has improved from 6.34fps to 6.78fps.
    Tested with following command:
    ./configure && make -j5
    ./x264 --threads 4 -o out.mkv yuv_1920x1080.yuv
    
    functions           performance     performance
                            (c)            (asm)
    coeff_last15             3               2
    coeff_last16             3               1
    coeff_last64             42              6
    decimate_score15         8               12
    decimate_score16         8               11
    decimate_score64         61              43
    dequant_4x4_cqm          16              5
    dequant_4x4_dc_cqm       13              5
    dequant_4x4_dc_flat      13              5
    dequant_4x4_flat         16              5
    dequant_8x8_cqm          71              9
    dequant_8x8_flat         71              9
    
    Signed-off-by: default avatarShiyou Yin <yinshiyou-hf@loongson.cn>
    65e7bac5
  • Yin Shiyou's avatar
    loongarch: Improve the performance of mc series functions · 981c8f25
    Yin Shiyou authored
    
    Performance has improved from 6.78fps to 10.53fps.
    Tested with following command:
    ./configure && make -j5
    ./x264 --threads 4 -o out.mkv yuv_1920x1080.yuv
    
    functions           performance     performance
                            (c)            (asm)
    avg_4x2                  16              5
    avg_4x4                  30              6
    avg_4x8                  63              10
    avg_4x16                 124             19
    avg_8x4                  60              6
    avg_8x8                  119             10
    avg_8x16                 233             19
    avg_16x8                 229             21
    avg_16x16                451             41
    get_ref_4x4              30              9
    get_ref_4x8              52              11
    get_ref_8x4              45              9
    get_ref_8x8              80              11
    get_ref_8x16             156             16
    get_ref_12x10            137             13
    get_ref_16x8             147             11
    get_ref_16x16            282             16
    get_ref_20x18            278             22
    hpel_filter              5163            686
    lowres_init              5440            286
    mc_chroma_2x2            24              7
    mc_chroma_2x4            42              10
    mc_chroma_4x2            41              7
    mc_chroma_4x4            75              10
    mc_chroma_4x8            144             19
    mc_chroma_8x4            137             15
    mc_chroma_8x8            269             28
    mc_luma_4x4              30              10
    mc_luma_4x8              52              12
    mc_luma_8x4              44              10
    mc_luma_8x8              80              13
    mc_luma_8x16             156             19
    mc_luma_16x8             147             13
    mc_luma_16x16            281             19
    memcpy_aligned           14              9
    memzero_aligned          24              4
    offsetadd_w4             79              18
    offsetadd_w8             142             18
    offsetadd_w16            277             25
    offsetadd_w20            1118            38
    offsetsub_w4             75              18
    offsetsub_w8             140             18
    offsetsub_w16            265             25
    offsetsub_w20            989             39
    weight_w4                111             19
    weight_w8                205             19
    weight_w16               396             29
    weight_w20               1143            45
    deinterleave_chroma_fdec 76              9
    deinterleave_chroma_fenc 86              9
    plane_copy_deinterleave  733             90
    plane_copy_interleave    791             245
    store_interleave_chroma  82              12
    
    Signed-off-by: default avatarXiwei Gu <guxiwei-hf@loongson.cn>
    981c8f25
  • Yin Shiyou's avatar
    loongarch: Improve the performance of dct series functions · fa7f1fce
    Yin Shiyou authored
    
    Performance has improved from 10.53fps to 11.27fps.
    Tested with following command:
    ./configure && make -j5
    ./x264 --threads 4 -o out.mkv yuv_1920x1080.yuv
    
    functions           performance     performance
                            (c)            (asm)
    add4x4_idct              34              9
    add8x8_idct              139             31
    add8x8_idct8             269             39
    add8x8_idct_dc           67              7
    add16x16_idct            564             123
    add16x16_idct_dc         260             22
    dct4x4dc                 18              10
    idct4x4dc                16              9
    sub4x4_dct               25              7
    sub8x8_dct               101             12
    sub8x8_dct8              160             25
    sub16x16_dct             403             52
    sub16x16_dct8            646             68
    zigzag_scan_4x4_frame    4               1
    
    Signed-off-by: default avatarzhoupeng <zhoupeng@loongson.cn>
    fa7f1fce
  • Yin Shiyou's avatar
    loongarch: Improve the performance of pixel series functions · 5f84d403
    Yin Shiyou authored
    
    Performance has improved from 11.27fps to 20.50fps by using the
    following command:
    ./configure && make -j5
    ./x264 --threads 4 -o out.mkv yuv_1920x1080.yuv
    
    functions           performance     performance
                            (c)            (asm)
    hadamard_ac_8x8          117             21
    hadamard_ac_8x16         236             42
    hadamard_ac_16x8         235             31
    hadamard_ac_16x16        473             60
    intra_sad_x3_4x4         50              21
    intra_sad_x3_8x8         183             34
    intra_sad_x3_8x8c        181             36
    intra_sad_x3_16x16       643             68
    intra_satd_x3_4x4        83              61
    intra_satd_x3_8x8c       344             81
    intra_satd_x3_16x16      1389            136
    sa8d_8x8                 97              19
    sa8d_16x16               394             68
    satd_4x4                 24              8
    satd_4x8                 51              11
    satd_4x16                103             24
    satd_8x4                 52              9
    satd_8x8                 108             12
    satd_8x16                218             24
    satd_16x8                218             19
    satd_16x16               437             38
    ssd_4x4                  10              5
    ssd_4x8                  24              8
    ssd_4x16                 42              15
    ssd_8x4                  23              5
    ssd_8x8                  37              9
    ssd_8x16                 74              17
    ssd_16x8                 72              11
    ssd_16x16                140             23
    var2_8x8                 91              37
    var2_8x16                176             66
    var_8x8                  50              15
    var_8x16                 65              29
    var_16x16                132             56
    
    Signed-off-by: default avatarHecai Yuan <yuanhecai@loongson.cn>
    5f84d403
Showing
with 14718 additions and 1 deletion
......@@ -197,6 +197,32 @@ SRCS_X += common/mips/dct-c.c \
endif
endif
# LOONGARCH optimization
ifeq ($(SYS_ARCH),LOONGARCH)
# Build the LoongArch sources only when configure detected LSX support.
ifneq ($(findstring HAVE_LSX 1, $(CONFIG)),)
# Hand-written LSX/LASX assembly kernels.
SRCASM_X += common/loongarch/deblock-a.S \
common/loongarch/sad-a.S \
common/loongarch/predict-a.S \
common/loongarch/quant-a.S \
common/loongarch/mc-a.S \
common/loongarch/dct-a.S \
common/loongarch/pixel-a.S
# C glue/init code that wires the assembly into the function tables.
SRCS_X += common/loongarch/predict-c.c \
common/loongarch/mc-c.c \
common/loongarch/pixel-c.c
# Empty append — appears intended to ensure OBJASM is defined even when
# neither bit depth below is enabled; TODO confirm against build system.
OBJASM +=
# Each .S file is assembled once per enabled bit depth (-8 / -10 suffix).
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.S=%-8.o)
endif
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.S=%-10.o)
endif
endif
endif
endif
ifneq ($(HAVE_GETOPT_LONG),1)
......
......@@ -98,6 +98,9 @@ const x264_cpu_name_t x264_cpu_names[] =
{"NEON", X264_CPU_NEON},
#elif ARCH_MIPS
{"MSA", X264_CPU_MSA},
#elif ARCH_LOONGARCH
{"LSX", X264_CPU_LSX},
{"LASX", X264_CPU_LASX},
#endif
{"", 0},
};
......@@ -431,6 +434,25 @@ uint32_t x264_cpu_detect( void )
return X264_CPU_MSA;
}
#elif HAVE_LSX
#include <sys/auxv.h>
/* AT_HWCAP feature bits for the LoongArch SIMD extensions
 * (bit 4 = LSX, bit 5 = LASX) — presumably mirroring the Linux kernel's
 * uapi hwcap definitions; verify against <asm/hwcap.h>. */
#define LA_HWCAP_LSX ( 1U << 4 )
#define LA_HWCAP_LASX ( 1U << 5 )
/* Detect LoongArch SIMD support by querying the kernel-provided
 * auxiliary vector and translating the hwcap bits into x264 CPU flags.
 * Returns a bitmask of X264_CPU_LSX/X264_CPU_LASX (0 if neither). */
uint32_t x264_cpu_detect( void )
{
    uint32_t flags = 0;
    uint32_t hwcap = (uint32_t)getauxval( AT_HWCAP );
    if( hwcap & LA_HWCAP_LSX )
        flags |= X264_CPU_LSX;
    if( hwcap & LA_HWCAP_LASX )
        flags |= X264_CPU_LASX;
    return flags;
}
#else
uint32_t x264_cpu_detect( void )
......
......@@ -41,7 +41,9 @@
#if HAVE_MSA
# include "mips/dct.h"
#endif
#if HAVE_LSX
# include "loongarch/dct.h"
#endif
static void dct4x4dc( dctcoef d[16] )
{
dctcoef tmp[16];
......@@ -727,6 +729,38 @@ void x264_dct_init( uint32_t cpu, x264_dct_function_t *dctf )
}
#endif
#if HAVE_LSX
if( cpu&X264_CPU_LSX )
{
dctf->sub4x4_dct = x264_sub4x4_dct_lsx;
dctf->add4x4_idct = x264_add4x4_idct_lsx;
dctf->dct4x4dc = x264_dct4x4dc_lsx;
dctf->idct4x4dc = x264_idct4x4dc_lsx;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_lsx;
dctf->sub8x8_dct = x264_sub8x8_dct_lsx;
dctf->add8x8_idct = x264_add8x8_idct_lsx;
dctf->add8x8_idct8 = x264_add8x8_idct8_lsx;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_lsx;
dctf->add16x16_idct = x264_add16x16_idct_lsx;
dctf->sub16x16_dct = x264_sub16x16_dct_lsx;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_lsx;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_lsx;
}
if( cpu&X264_CPU_LASX )
{
dctf->sub8x8_dct = x264_sub8x8_dct_lasx;
dctf->sub16x16_dct = x264_sub16x16_dct_lasx;
dctf->add8x8_idct = x264_add8x8_idct_lasx;
dctf->add8x8_idct8 = x264_add8x8_idct8_lasx;
dctf->add16x16_idct = x264_add16x16_idct_lasx;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_lasx;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_lasx;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_lasx;
dctf->dct4x4dc = x264_dct4x4dc_lasx;
dctf->idct4x4dc = x264_idct4x4dc_lasx;
}
#endif
#endif // HIGH_BIT_DEPTH
}
......@@ -1087,5 +1121,12 @@ void x264_zigzag_init( uint32_t cpu, x264_zigzag_function_t *pf_progressive, x26
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa;
}
#endif
#if HAVE_LSX
if( cpu&X264_CPU_LASX )
{
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_lasx;
}
#endif
#endif // !HIGH_BIT_DEPTH
}
......@@ -680,6 +680,9 @@ void x264_macroblock_deblock( x264_t *h )
#if HAVE_MSA
#include "mips/deblock.h"
#endif
#if HAVE_LSX
#include "loongarch/deblock.h"
#endif
void x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff )
{
......@@ -816,6 +819,24 @@ void x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_strength = x264_deblock_strength_msa;
}
#endif
#if HAVE_LSX
if( cpu&X264_CPU_LSX )
{
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_lsx;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_lsx;
pf->deblock_strength = x264_deblock_strength_lsx;
}
if( cpu&X264_CPU_LASX )
{
pf->deblock_luma[1] = x264_deblock_v_luma_lasx;
pf->deblock_luma[0] = x264_deblock_h_luma_lasx;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_lasx;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_lasx;
pf->deblock_strength = x264_deblock_strength_lasx;
}
#endif
#endif // !HIGH_BIT_DEPTH
/* These functions are equivalent, so don't duplicate them. */
......
This diff is collapsed.
/*****************************************************************************
* dct.h: loongarch transform and zigzag
*****************************************************************************
* Copyright (C) 2023 x264 project
*
* Authors: Peng Zhou <zhoupeng@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_LOONGARCH_DCT_H
#define X264_LOONGARCH_DCT_H

/* Forward/inverse DCT and zigzag-scan primitives for LoongArch.
 * Every symbol is wrapped with x264_template() so each bit-depth build
 * gets its own mangled name.  Declarations are grouped by instruction
 * set (LSX = 128-bit SIMD, LASX = 256-bit SIMD); each symbol is
 * declared exactly once (the former duplicate declarations of the LSX
 * sub8x8_dct8/add4x4_idct entries were removed). */

/* LSX (128-bit) implementations */
#define x264_sub4x4_dct_lsx x264_template(sub4x4_dct_lsx)
void x264_sub4x4_dct_lsx( int16_t p_dst[16], uint8_t *p_src, uint8_t *p_ref );
#define x264_sub8x8_dct_lsx x264_template(sub8x8_dct_lsx)
void x264_sub8x8_dct_lsx( int16_t p_dst[4][16], uint8_t *p_src, uint8_t *p_ref );
#define x264_sub16x16_dct_lsx x264_template(sub16x16_dct_lsx)
void x264_sub16x16_dct_lsx( int16_t p_dst[16][16], uint8_t *p_src, uint8_t *p_ref );
#define x264_sub8x8_dct8_lsx x264_template(sub8x8_dct8_lsx)
void x264_sub8x8_dct8_lsx( int16_t pi_dct[64], uint8_t *p_pix1, uint8_t *p_pix2 );
#define x264_sub16x16_dct8_lsx x264_template(sub16x16_dct8_lsx)
void x264_sub16x16_dct8_lsx( int16_t pi_dct[4][64], uint8_t *p_pix1,
                             uint8_t *p_pix2 );
#define x264_add4x4_idct_lsx x264_template(add4x4_idct_lsx)
void x264_add4x4_idct_lsx( uint8_t *p_dst, int16_t pi_dct[16] );
#define x264_add8x8_idct_lsx x264_template(add8x8_idct_lsx)
void x264_add8x8_idct_lsx( uint8_t *p_dst, int16_t pi_dct[4][16] );
#define x264_add16x16_idct_lsx x264_template(add16x16_idct_lsx)
void x264_add16x16_idct_lsx( uint8_t *p_dst, int16_t pi_dct[16][16] );
#define x264_add8x8_idct8_lsx x264_template(add8x8_idct8_lsx)
void x264_add8x8_idct8_lsx( uint8_t *p_dst, int16_t pi_dct[64] );
#define x264_add8x8_idct_dc_lsx x264_template(add8x8_idct_dc_lsx)
void x264_add8x8_idct_dc_lsx( uint8_t *p_dst, int16_t dct[4] );
#define x264_add16x16_idct_dc_lsx x264_template(add16x16_idct_dc_lsx)
void x264_add16x16_idct_dc_lsx( uint8_t *p_dst, int16_t dct[16] );
#define x264_idct4x4dc_lsx x264_template(idct4x4dc_lsx)
void x264_idct4x4dc_lsx( int16_t d[16] );
#define x264_dct4x4dc_lsx x264_template(dct4x4dc_lsx)
void x264_dct4x4dc_lsx( int16_t d[16] );
#define x264_zigzag_scan_4x4_frame_lsx x264_template(zigzag_scan_4x4_frame_lsx)
void x264_zigzag_scan_4x4_frame_lsx( int16_t level[16], int16_t dct[16] );

/* LASX (256-bit) implementations */
#define x264_sub8x8_dct_lasx x264_template(sub8x8_dct_lasx)
void x264_sub8x8_dct_lasx( int16_t p_dst[4][16], uint8_t *p_src, uint8_t *p_ref );
#define x264_sub16x16_dct_lasx x264_template(sub16x16_dct_lasx)
void x264_sub16x16_dct_lasx( int16_t p_dst[16][16], uint8_t *p_src, uint8_t *p_ref );
#define x264_sub16x16_dct8_lasx x264_template(sub16x16_dct8_lasx)
void x264_sub16x16_dct8_lasx( int16_t pi_dct[4][64], uint8_t *p_pix1,
                              uint8_t *p_pix2 );
#define x264_add8x8_idct_lasx x264_template(add8x8_idct_lasx)
void x264_add8x8_idct_lasx( uint8_t *p_dst, int16_t pi_dct[4][16] );
#define x264_add16x16_idct_lasx x264_template(add16x16_idct_lasx)
void x264_add16x16_idct_lasx( uint8_t *p_dst, int16_t pi_dct[16][16] );
#define x264_add8x8_idct8_lasx x264_template(add8x8_idct8_lasx)
void x264_add8x8_idct8_lasx( uint8_t *p_dst, int16_t pi_dct[64] );
#define x264_add8x8_idct_dc_lasx x264_template(add8x8_idct_dc_lasx)
void x264_add8x8_idct_dc_lasx( uint8_t *p_dst, int16_t dct[4] );
#define x264_add16x16_idct_dc_lasx x264_template(add16x16_idct_dc_lasx)
void x264_add16x16_idct_dc_lasx( uint8_t *p_dst, int16_t dct[16] );
#define x264_idct4x4dc_lasx x264_template(idct4x4dc_lasx)
void x264_idct4x4dc_lasx( int16_t d[16] );
#define x264_dct4x4dc_lasx x264_template(dct4x4dc_lasx)
void x264_dct4x4dc_lasx( int16_t d[16] );
#define x264_zigzag_scan_4x4_frame_lasx x264_template(zigzag_scan_4x4_frame_lasx)
void x264_zigzag_scan_4x4_frame_lasx( int16_t level[16], int16_t dct[16] );

#endif /* X264_LOONGARCH_DCT_H */
This diff is collapsed.
/*****************************************************************************
* deblock.h: loongarch deblock
*****************************************************************************
* Copyright (C) 2023 x264 project
*
* Authors: Hao Chen <chenhao@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_LOONGARCH_DEBLOCK_H
#define X264_LOONGARCH_DEBLOCK_H

/* Deblocking-filter primitives for LoongArch, 8-bit depth only.
 * Symbols are wrapped with x264_template() for per-bit-depth mangling
 * and grouped by instruction set (LSX = 128-bit, LASX = 256-bit). */
#if !HIGH_BIT_DEPTH

/* LSX (128-bit) implementations */
#define x264_deblock_v_luma_intra_lsx x264_template(deblock_v_luma_intra_lsx)
void x264_deblock_v_luma_intra_lsx( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_luma_intra_lsx x264_template(deblock_h_luma_intra_lsx)
void x264_deblock_h_luma_intra_lsx( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_strength_lsx x264_template(deblock_strength_lsx)
void x264_deblock_strength_lsx( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                int mvy_limit, int bframe );

/* LASX (256-bit) implementations */
#define x264_deblock_v_luma_lasx x264_template(deblock_v_luma_lasx)
void x264_deblock_v_luma_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_luma_lasx x264_template(deblock_h_luma_lasx)
void x264_deblock_h_luma_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_v_luma_intra_lasx x264_template(deblock_v_luma_intra_lasx)
void x264_deblock_v_luma_intra_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_luma_intra_lasx x264_template(deblock_h_luma_intra_lasx)
void x264_deblock_h_luma_intra_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_strength_lasx x264_template(deblock_strength_lasx)
void x264_deblock_strength_lasx( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                 int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                 int mvy_limit, int bframe );

#endif /* !HIGH_BIT_DEPTH */
#endif /* X264_LOONGARCH_DEBLOCK_H */
/*********************************************************************
* Copyright (c) 2022 Loongson Technology Corporation Limited
* Contributed by Xiwei Gu <guxiwei-hf@loongson.cn>
* Shiyou Yin <yinshiyou-hf@loongson.cn>
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*********************************************************************/
/*
* This file is a LoongArch assembly helper file and available under ISC
* license. It provides a large number of macros and alias to simplify
* writing assembly code, especially for LSX and LASX optimizations.
*
* Any one can modify it or add new features for his/her own purposes.
* Contributing a patch will be appreciated as it might be useful for
* others as well. Send patches to loongson contributor mentioned above.
*
* MAJOR version: Usage changes, incompatible with previous version.
* MINOR version: Add new macros/functions, or bug fixes.
* MICRO version: Comment changes or implementation changes.
*/
/* Macro-library version, following the MAJOR/MINOR/MICRO policy stated
 * in the header comment above. */
#define LML_VERSION_MAJOR 0
#define LML_VERSION_MINOR 2
#define LML_VERSION_MICRO 2

/* Optional prefix prepended to every symbol emitted by the `function`
 * macro below (empty by default). */
#define ASM_PREF

/* Default .align exponent used by `function`/`const`: 2^5 = 32 bytes. */
#define DEFAULT_ALIGN 5
/* Open a global function definition.
 * Usage:  function name [, align]
 *   ...body...
 *   endfunc
 * `endfunc` is (re)defined here on each use so that \name is captured;
 * it emits the return instruction, the symbol's .size directive, and
 * then purges itself so the next `function` can redefine it.
 * `align` is the .align exponent (2^align bytes), default DEFAULT_ALIGN. */
.macro function name, align=DEFAULT_ALIGN
.macro endfunc
    jirl $r0, $r1, 0x0                     /* return: jump to $ra ($r1), link discarded */
    .size ASM_PREF\name, . - ASM_PREF\name /* record symbol size for tooling */
    .purgem endfunc                        /* allow redefinition by the next function */
.endm
.text ;
.align \align ;
.globl ASM_PREF\name ;
.type ASM_PREF\name, @function ;
ASM_PREF\name: ;
.endm
/* Open a named constant block in .rodata.
 * Usage:  const name [, align]
 *   ...data directives...
 *   endconst
 * `endconst` is (re)defined per use so \name is captured; it emits the
 * symbol's .size directive and purges itself. */
.macro const name, align=DEFAULT_ALIGN
.macro endconst
    .size \name, . - \name
    .purgem endconst
.endm
.section .rodata
.align \align
\name:
.endm
/*
 *============================================================================
 * LoongArch register alias
 *============================================================================
 * Plain-name aliases so registers can be written without the leading '$'
 * in hand-written assembly below.
 */
/* Argument / return-value registers a0-a7 */
#define a0 $a0
#define a1 $a1
#define a2 $a2
#define a3 $a3
#define a4 $a4
#define a5 $a5
#define a6 $a6
#define a7 $a7
/* Temporary registers t0-t8 */
#define t0 $t0
#define t1 $t1
#define t2 $t2
#define t3 $t3
#define t4 $t4
#define t5 $t5
#define t6 $t6
#define t7 $t7
#define t8 $t8
/* Saved registers s0-s8 */
#define s0 $s0
#define s1 $s1
#define s2 $s2
#define s3 $s3
#define s4 $s4
#define s5 $s5
#define s6 $s6
#define s7 $s7
#define s8 $s8
/* Hard-wired zero, stack pointer, return address */
#define zero $zero
#define sp $sp
#define ra $ra
/* Floating-point registers f0-f31 */
#define f0 $f0
#define f1 $f1
#define f2 $f2
#define f3 $f3
#define f4 $f4
#define f5 $f5
#define f6 $f6
#define f7 $f7
#define f8 $f8
#define f9 $f9
#define f10 $f10
#define f11 $f11
#define f12 $f12
#define f13 $f13
#define f14 $f14
#define f15 $f15
#define f16 $f16
#define f17 $f17
#define f18 $f18
#define f19 $f19
#define f20 $f20
#define f21 $f21
#define f22 $f22
#define f23 $f23
#define f24 $f24
#define f25 $f25
#define f26 $f26
#define f27 $f27
#define f28 $f28
#define f29 $f29
#define f30 $f30
#define f31 $f31
/* LSX 128-bit vector registers vr0-vr31 */
#define vr0 $vr0
#define vr1 $vr1
#define vr2 $vr2
#define vr3 $vr3
#define vr4 $vr4
#define vr5 $vr5
#define vr6 $vr6
#define vr7 $vr7
#define vr8 $vr8
#define vr9 $vr9
#define vr10 $vr10
#define vr11 $vr11
#define vr12 $vr12
#define vr13 $vr13
#define vr14 $vr14
#define vr15 $vr15
#define vr16 $vr16
#define vr17 $vr17
#define vr18 $vr18
#define vr19 $vr19
#define vr20 $vr20
#define vr21 $vr21
#define vr22 $vr22
#define vr23 $vr23
#define vr24 $vr24
#define vr25 $vr25
#define vr26 $vr26
#define vr27 $vr27
#define vr28 $vr28
#define vr29 $vr29
#define vr30 $vr30
#define vr31 $vr31
/* LASX 256-bit vector registers xr0-xr31 */
#define xr0 $xr0
#define xr1 $xr1
#define xr2 $xr2
#define xr3 $xr3
#define xr4 $xr4
#define xr5 $xr5
#define xr6 $xr6
#define xr7 $xr7
#define xr8 $xr8
#define xr9 $xr9
#define xr10 $xr10
#define xr11 $xr11
#define xr12 $xr12
#define xr13 $xr13
#define xr14 $xr14
#define xr15 $xr15
#define xr16 $xr16
#define xr17 $xr17
#define xr18 $xr18
#define xr19 $xr19
#define xr20 $xr20
#define xr21 $xr21
#define xr22 $xr22
#define xr23 $xr23
#define xr24 $xr24
#define xr25 $xr25
#define xr26 $xr26
#define xr27 $xr27
#define xr28 $xr28
#define xr29 $xr29
#define xr30 $xr30
#define xr31 $xr31
/*
 *============================================================================
 * LSX/LASX synthesize instructions
 * (pseudo-instructions built from real LSX/LASX instruction pairs)
 *============================================================================
 */
/*
 * Description : Dot product of byte/halfword vector elements.
 *               vd = even(vj)*even(vk) + odd(vj)*odd(vk), each pairwise
 *               product widened to twice the input element size.
 *               Suffix meaning follows the underlying instructions:
 *               .h.bu   - unsigned bytes          -> halfword result
 *               .h.bu.b - unsigned * signed bytes -> halfword result
 *               .w.h    - signed halfwords        -> word result
 * Arguments : Inputs  - vj, vk
 *             Outputs - vd (overwritten, not accumulated)
 * Return Type - halfword (.h.*) or word (.w.h)
 */
.macro vdp2.h.bu vd, vj, vk
vmulwev.h.bu \vd, \vj, \vk
vmaddwod.h.bu \vd, \vj, \vk
.endm
.macro vdp2.h.bu.b vd, vj, vk
vmulwev.h.bu.b \vd, \vj, \vk
vmaddwod.h.bu.b \vd, \vj, \vk
.endm
.macro vdp2.w.h vd, vj, vk
vmulwev.w.h \vd, \vj, \vk
vmaddwod.w.h \vd, \vj, \vk
.endm
/* LASX (256-bit) variants of the same dot products. */
.macro xvdp2.h.bu xd, xj, xk
xvmulwev.h.bu \xd, \xj, \xk
xvmaddwod.h.bu \xd, \xj, \xk
.endm
.macro xvdp2.h.bu.b xd, xj, xk
xvmulwev.h.bu.b \xd, \xj, \xk
xvmaddwod.h.bu.b \xd, \xj, \xk
.endm
.macro xvdp2.w.h xd, xj, xk
xvmulwev.w.h \xd, \xj, \xk
xvmaddwod.w.h \xd, \xj, \xk
.endm
/*
 * Description : Dot product and accumulation of vector elements.
 *               vd += even(vj)*even(vk) + odd(vj)*odd(vk), each product
 *               widened to twice the input element size.
 * Arguments : Inputs  - vj, vk
 *             Outputs - vd (accumulator, read-modify-write)
 * Return Type - twice size of input
 */
.macro vdp2add.h.bu vd, vj, vk
vmaddwev.h.bu \vd, \vj, \vk
vmaddwod.h.bu \vd, \vj, \vk
.endm
.macro vdp2add.h.bu.b vd, vj, vk
vmaddwev.h.bu.b \vd, \vj, \vk
vmaddwod.h.bu.b \vd, \vj, \vk
.endm
.macro vdp2add.w.h vd, vj, vk
vmaddwev.w.h \vd, \vj, \vk
vmaddwod.w.h \vd, \vj, \vk
.endm
/* LASX (256-bit) variants. */
.macro xvdp2add.h.bu.b xd, xj, xk
xvmaddwev.h.bu.b \xd, \xj, \xk
xvmaddwod.h.bu.b \xd, \xj, \xk
.endm
.macro xvdp2add.w.h xd, xj, xk
xvmaddwev.w.h \xd, \xj, \xk
xvmaddwod.w.h \xd, \xj, \xk
.endm
/*
 * Description : Clamp each element of a vector to a range.
 *               clip:    vd = min(max(vj, vk), va), i.e. clamp vj into
 *                        [vk, va] elementwise.
 *               clip255: vd = clamp(vj, 0, 255) - signed max with zero,
 *                        then unsigned saturation to 8 bits (imm 7).
 */
.macro vclip.h vd, vj, vk, va
vmax.h \vd, \vj, \vk
vmin.h \vd, \vd, \va
.endm
.macro vclip255.w vd, vj
vmaxi.w \vd, \vj, 0
vsat.wu \vd, \vd, 7
.endm
/* LASX (256-bit) variants. */
.macro xvclip.h xd, xj, xk, xa
xvmax.h \xd, \xj, \xk
xvmin.h \xd, \xd, \xa
.endm
.macro xvclip255.h xd, xj
xvmaxi.h \xd, \xj, 0
xvsat.hu \xd, \xd, 7
.endm
.macro xvclip255.w xd, xj
xvmaxi.w \xd, \xj, 0
xvsat.wu \xd, \xd, 7
.endm
/*
 * Description : Advance the destination pointer, then store one vector
 *               element there.  NOTE: the address register rk is updated
 *               in place (rk += ra) as a side effect.
 *               vd : Data vector to be stored
 *               rk : Address of data storage (modified: rk += ra)
 *               ra : Offset added to the address before the store
 *               si : Index of the element in vd to store
 */
.macro vstelmx.b vd, rk, ra, si
add.d \rk, \rk, \ra
vstelm.b \vd, \rk, 0, \si
.endm
.macro vstelmx.h vd, rk, ra, si
add.d \rk, \rk, \ra
vstelm.h \vd, \rk, 0, \si
.endm
.macro vstelmx.w vd, rk, ra, si
add.d \rk, \rk, \ra
vstelm.w \vd, \rk, 0, \si
.endm
.macro vstelmx.d vd, rk, ra, si
add.d \rk, \rk, \ra
vstelm.d \vd, \rk, 0, \si
.endm
/* Vector register move (xd = xj), implemented as a self-OR. */
.macro vmov xd, xj
vor.v \xd, \xj, \xj
.endm
.macro xmov xd, xj
xvor.v \xd, \xj, \xj
.endm
/* LASX variant of vstelmx.d: rk += ra, then store element si of xd. */
.macro xvstelmx.d xd, rk, ra, si
add.d \rk, \rk, \ra
xvstelm.d \xd, \rk, 0, \si
.endm
/*
 *============================================================================
 * LSX/LASX custom macros
 *============================================================================
 */
/*
 * Load 4 float, double, V128 or V256 elements with stride:
 *   out0 = [src], out1 = [src + stride],
 *   out2 = [src + stride2], out3 = [src + stride3].
 * The caller supplies stride2/stride3 (typically 2*stride and 3*stride);
 * src is not modified.
 */
.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
fld.s \out0, \src, 0
fldx.s \out1, \src, \stride
fldx.s \out2, \src, \stride2
fldx.s \out3, \src, \stride3
.endm
.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
fld.d \out0, \src, 0
fldx.d \out1, \src, \stride
fldx.d \out2, \src, \stride2
fldx.d \out3, \src, \stride3
.endm
.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
vld \out0, \src, 0
vldx \out1, \src, \stride
vldx \out2, \src, \stride2
vldx \out3, \src, \stride3
.endm
.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
xvld \out0, \src, 0
xvldx \out1, \src, \stride
xvldx \out2, \src, \stride2
xvldx \out3, \src, \stride3
.endm
/*
 * Description : Transpose 4x4 block with half-word elements in vectors.
 * Arguments : Inputs  - in0, in1, in2, in3 (one 4-halfword row in the
 *                       low 64 bits of each input)
 *             Outputs - out0, out1, out2, out3 (transposed rows in the
 *                       low 64 bits of each output)
 *             tmp0, tmp1 are clobbered.
 */
.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
vilvl.h \tmp0, \in1, \in0
vilvl.h \tmp1, \in3, \in2
vilvl.w \out0, \tmp1, \tmp0
vilvh.w \out2, \tmp1, \tmp0
vilvh.d \out1, \out0, \out0
vilvh.d \out3, \out0, \out2
.endm
/*
 * Description : Transpose 4x4 block with word elements in vectors.
 * Arguments : Inputs  - in0, in1, in2, in3
 *             Outputs - out0, out1, out2, out3
 *             tmp0, tmp1 are clobbered.
 * Details :
 * Example :
 *            1, 2, 3, 4            1, 5, 9,13
 *            5, 6, 7, 8    to      2, 6,10,14
 *            9,10,11,12  =====>    3, 7,11,15
 *           13,14,15,16            4, 8,12,16
 */
.macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
vilvl.w \tmp0, \in1, \in0
vilvh.w \out1, \in1, \in0
vilvl.w \tmp1, \in3, \in2
vilvh.w \out3, \in3, \in2
vilvl.d \out0, \tmp1, \tmp0
vilvl.d \out2, \out3, \out1
vilvh.d \out3, \out3, \out1
vilvh.d \out1, \tmp1, \tmp0
.endm
/*
 * Description : Transpose 8x8 block with half-word elements in vectors.
 * Arguments : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *             Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *             tmp0-tmp7 are clobbered.
 */
.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
tmp3, tmp4, tmp5, tmp6, tmp7
/* Interleave the low halves, then the interleaved results again. */
vilvl.h \tmp0, \in6, \in4
vilvl.h \tmp1, \in7, \in5
vilvl.h \tmp2, \in2, \in0
vilvl.h \tmp3, \in3, \in1
vilvl.h \tmp4, \tmp1, \tmp0
vilvh.h \tmp5, \tmp1, \tmp0
vilvl.h \tmp6, \tmp3, \tmp2
vilvh.h \tmp7, \tmp3, \tmp2
/* Same again for the high halves of the inputs. */
vilvh.h \tmp0, \in6, \in4
vilvh.h \tmp1, \in7, \in5
vilvh.h \tmp2, \in2, \in0
vilvh.h \tmp3, \in3, \in1
/* Gather even/odd doublewords into the first four output rows. */
vpickev.d \out0, \tmp4, \tmp6
vpickod.d \out1, \tmp4, \tmp6
vpickev.d \out2, \tmp5, \tmp7
vpickod.d \out3, \tmp5, \tmp7
vilvl.h \tmp4, \tmp1, \tmp0
vilvh.h \tmp5, \tmp1, \tmp0
vilvl.h \tmp6, \tmp3, \tmp2
vilvh.h \tmp7, \tmp3, \tmp2
/* ... and the last four output rows. */
vpickev.d \out4, \tmp4, \tmp6
vpickod.d \out5, \tmp4, \tmp6
vpickev.d \out6, \tmp5, \tmp7
vpickod.d \out7, \tmp5, \tmp7
.endm
/*
 * Description : Transpose 16x8 block with byte elements in vectors.
 * Arguments : Inputs  - in0 .. in15
 *             Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *             tmp0-tmp7 are clobbered; out0-out7 are also used as
 *             intermediates before receiving their final values.
 */
.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \
in8, in9, in10, in11, in12, in13, in14, in15, \
out0, out1, out2, out3, out4, out5, out6, out7,\
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
/* Stage 1: byte interleave of row pairs. */
xvilvl.b \tmp0, \in2, \in0
xvilvl.b \tmp1, \in3, \in1
xvilvl.b \tmp2, \in6, \in4
xvilvl.b \tmp3, \in7, \in5
xvilvl.b \tmp4, \in10, \in8
xvilvl.b \tmp5, \in11, \in9
xvilvl.b \tmp6, \in14, \in12
xvilvl.b \tmp7, \in15, \in13
xvilvl.b \out0, \tmp1, \tmp0
xvilvh.b \out1, \tmp1, \tmp0
xvilvl.b \out2, \tmp3, \tmp2
xvilvh.b \out3, \tmp3, \tmp2
xvilvl.b \out4, \tmp5, \tmp4
xvilvh.b \out5, \tmp5, \tmp4
xvilvl.b \out6, \tmp7, \tmp6
xvilvh.b \out7, \tmp7, \tmp6
/* Stage 2: word interleave of the partial results. */
xvilvl.w \tmp0, \out2, \out0
xvilvh.w \tmp2, \out2, \out0
xvilvl.w \tmp4, \out3, \out1
xvilvh.w \tmp6, \out3, \out1
xvilvl.w \tmp1, \out6, \out4
xvilvh.w \tmp3, \out6, \out4
xvilvl.w \tmp5, \out7, \out5
xvilvh.w \tmp7, \out7, \out5
/* Stage 3: doubleword interleave produces the final rows. */
xvilvl.d \out0, \tmp1, \tmp0
xvilvh.d \out1, \tmp1, \tmp0
xvilvl.d \out2, \tmp3, \tmp2
xvilvh.d \out3, \tmp3, \tmp2
xvilvl.d \out4, \tmp5, \tmp4
xvilvh.d \out5, \tmp5, \tmp4
xvilvl.d \out6, \tmp7, \tmp6
xvilvh.d \out7, \tmp7, \tmp6
.endm
/*
 * Description : Transpose 4x4 block with half-word elements in vectors
 *               (LASX: the same shuffle is applied independently in each
 *               128-bit lane).
 * Arguments : Inputs  - in0, in1, in2, in3
 *             Outputs - out0, out1, out2, out3
 *             tmp0, tmp1 are clobbered.
 */
.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
xvilvl.h \tmp0, \in1, \in0
xvilvl.h \tmp1, \in3, \in2
xvilvl.w \out0, \tmp1, \tmp0
xvilvh.w \out2, \tmp1, \tmp0
xvilvh.d \out1, \out0, \out0
xvilvh.d \out3, \out0, \out2
.endm
/*
 * Description : Transpose 4x8 block with half-word elements in vectors
 *               (per 128-bit lane, as with the other LASX transposes).
 * Arguments : Inputs  - in0, in1, in2, in3
 *             Outputs - out0, out1, out2, out3
 *             tmp0, tmp1 are clobbered; out2/out3 are used as
 *             intermediates before receiving their final values.
 */
.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
xvilvl.h \tmp0, \in2, \in0
xvilvl.h \tmp1, \in3, \in1
xvilvl.h \out2, \tmp1, \tmp0
xvilvh.h \out3, \tmp1, \tmp0
xvilvl.d \out0, \out2, \out2
xvilvh.d \out1, \out2, \out2
xvilvl.d \out2, \out3, \out3
xvilvh.d \out3, \out3, \out3
.endm
/*
 * Description : Transpose 8x8 block with half-word elements in vectors
 *               (per 128-bit lane; LASX twin of LSX_TRANSPOSE8x8_H).
 * Arguments : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *             Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *             tmp0-tmp7 are clobbered.
 */
.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7, \
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
/* Interleave the low halves, then the interleaved results again. */
xvilvl.h \tmp0, \in6, \in4
xvilvl.h \tmp1, \in7, \in5
xvilvl.h \tmp2, \in2, \in0
xvilvl.h \tmp3, \in3, \in1
xvilvl.h \tmp4, \tmp1, \tmp0
xvilvh.h \tmp5, \tmp1, \tmp0
xvilvl.h \tmp6, \tmp3, \tmp2
xvilvh.h \tmp7, \tmp3, \tmp2
/* Same again for the high halves of the inputs. */
xvilvh.h \tmp0, \in6, \in4
xvilvh.h \tmp1, \in7, \in5
xvilvh.h \tmp2, \in2, \in0
xvilvh.h \tmp3, \in3, \in1
/* Gather even/odd doublewords into the first four output rows. */
xvpickev.d \out0, \tmp4, \tmp6
xvpickod.d \out1, \tmp4, \tmp6
xvpickev.d \out2, \tmp5, \tmp7
xvpickod.d \out3, \tmp5, \tmp7
xvilvl.h \tmp4, \tmp1, \tmp0
xvilvh.h \tmp5, \tmp1, \tmp0
xvilvl.h \tmp6, \tmp3, \tmp2
xvilvh.h \tmp7, \tmp3, \tmp2
/* ... and the last four output rows. */
xvpickev.d \out4, \tmp4, \tmp6
xvpickod.d \out5, \tmp4, \tmp6
xvpickev.d \out6, \tmp5, \tmp7
xvpickod.d \out7, \tmp5, \tmp7
.endm
/*
 * Description : Transpose 2x4x4 block with half-word elements in vectors
 *               (two 4x4 half-word blocks, one per 128-bit lane).
 * Arguments : Inputs  - in0, in1, in2, in3
 *             Outputs - out0, out1, out2, out3
 *             tmp0, tmp1, tmp2 are clobbered; out1/out2/out3 are used as
 *             intermediates before receiving their final values.
 */
.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1, tmp2
xvilvh.h \tmp1, \in0, \in1
xvilvl.h \out1, \in0, \in1
xvilvh.h \tmp0, \in2, \in3
xvilvl.h \out3, \in2, \in3
xvilvh.w \tmp2, \out3, \out1
xvilvl.w \out3, \out3, \out1
xvilvl.w \out2, \tmp0, \tmp1
xvilvh.w \tmp1, \tmp0, \tmp1
xvilvh.d \out0, \out2, \out3
xvilvl.d \out2, \out2, \out3
xvilvh.d \out1, \tmp1, \tmp2
xvilvl.d \out3, \tmp1, \tmp2
.endm
/*
 * Description : Transpose 4x4 block with word elements in vectors
 *               (the same 4x4 transpose is done in each 128-bit lane,
 *               as the example shows).
 * Arguments : Inputs  - in0, in1, in2, in3
 *             Outputs - out0, out1, out2, out3
 *             tmp0, tmp1 are clobbered.
 * Details :
 * Example :
 *            1, 2, 3, 4,  1, 2, 3, 4           1,5, 9,13, 1,5, 9,13
 *            5, 6, 7, 8,  5, 6, 7, 8    to     2,6,10,14, 2,6,10,14
 *            9,10,11,12,  9,10,11,12  =====>   3,7,11,15, 3,7,11,15
 *           13,14,15,16, 13,14,15,16           4,8,12,16, 4,8,12,16
 */
.macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
xvilvl.w \tmp0, \in1, \in0
xvilvh.w \out1, \in1, \in0
xvilvl.w \tmp1, \in3, \in2
xvilvh.w \out3, \in3, \in2
xvilvl.d \out0, \tmp1, \tmp0
xvilvl.d \out2, \out3, \out1
xvilvh.d \out3, \out3, \out1
xvilvh.d \out1, \tmp1, \tmp0
.endm
/*
 * Description : Transpose 8x8 block with word elements in vectors.
 * Arguments : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *             Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *             tmp0-tmp3 are clobbered.
 * Example : LASX_TRANSPOSE8x8_W
 *           in0 : 1,2,3,4,5,6,7,8
 *           in1 : 2,2,3,4,5,6,7,8
 *           in2 : 3,2,3,4,5,6,7,8
 *           in3 : 4,2,3,4,5,6,7,8
 *           in4 : 5,2,3,4,5,6,7,8
 *           in5 : 6,2,3,4,5,6,7,8
 *           in6 : 7,2,3,4,5,6,7,8
 *           in7 : 8,2,3,4,5,6,7,8
 *
 *           out0 : 1,2,3,4,5,6,7,8
 *           out1 : 2,2,2,2,2,2,2,2
 *           out2 : 3,3,3,3,3,3,3,3
 *           out3 : 4,4,4,4,4,4,4,4
 *           out4 : 5,5,5,5,5,5,5,5
 *           out5 : 6,6,6,6,6,6,6,6
 *           out6 : 7,7,7,7,7,7,7,7
 *           out7 : 8,8,8,8,8,8,8,8
 */
.macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,\
out0, out1, out2, out3, out4, out5, out6, out7,\
tmp0, tmp1, tmp2, tmp3
/* Per-lane 4x4 word transposes of the two input halves. */
xvilvl.w \tmp0, \in2, \in0
xvilvl.w \tmp1, \in3, \in1
xvilvh.w \tmp2, \in2, \in0
xvilvh.w \tmp3, \in3, \in1
xvilvl.w \out0, \tmp1, \tmp0
xvilvh.w \out1, \tmp1, \tmp0
xvilvl.w \out2, \tmp3, \tmp2
xvilvh.w \out3, \tmp3, \tmp2
xvilvl.w \tmp0, \in6, \in4
xvilvl.w \tmp1, \in7, \in5
xvilvh.w \tmp2, \in6, \in4
xvilvh.w \tmp3, \in7, \in5
xvilvl.w \out4, \tmp1, \tmp0
xvilvh.w \out5, \tmp1, \tmp0
xvilvl.w \out6, \tmp3, \tmp2
xvilvh.w \out7, \tmp3, \tmp2
/* Recombine the 128-bit lanes across the two halves. */
xmov \tmp0, \out0
xmov \tmp1, \out1
xmov \tmp2, \out2
xmov \tmp3, \out3
xvpermi.q \out0, \out4, 0x02
xvpermi.q \out1, \out5, 0x02
xvpermi.q \out2, \out6, 0x02
xvpermi.q \out3, \out7, 0x02
xvpermi.q \out4, \tmp0, 0x31
xvpermi.q \out5, \tmp1, 0x31
xvpermi.q \out6, \tmp2, 0x31
xvpermi.q \out7, \tmp3, 0x31
.endm
/*
 * Description : Transpose 4x4 block with double-word elements in vectors.
 * Arguments : Inputs  - in0, in1, in2, in3
 *             Outputs - out0, out1, out2, out3
 *             tmp0, tmp1 are clobbered.
 * Example : LASX_TRANSPOSE4x4_D
 *           in0 : 1,2,3,4
 *           in1 : 1,2,3,4
 *           in2 : 1,2,3,4
 *           in3 : 1,2,3,4
 *
 *           out0 : 1,1,1,1
 *           out1 : 2,2,2,2
 *           out2 : 3,3,3,3
 *           out3 : 4,4,4,4
 */
.macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
xvilvl.d \tmp0, \in1, \in0
xvilvh.d \out1, \in1, \in0
xvilvh.d \tmp1, \in3, \in2
xvilvl.d \out2, \in3, \in2
xvor.v \out0, \tmp0, \tmp0
xvor.v \out3, \tmp1, \tmp1
/* Swap 128-bit lanes between the partial results. */
xvpermi.q \out0, \out2, 0x02
xvpermi.q \out2, \tmp0, 0x31
xvpermi.q \out3, \out1, 0x31
xvpermi.q \out1, \tmp1, 0x02
.endm
/*****************************************************************************
* loongson_util.S: loongson utility macros
*****************************************************************************
* Copyright (C) 2023 x264 project
*
* Authors: Shiyou Yin <yinshiyou-hf@loongson.cn>
* Xiwei Gu <guxiwei-hf@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
/* Two-level paste so that macro arguments (e.g. BIT_DEPTH) are expanded
 * before concatenation. */
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
/* Symbol prefix for exported functions: expands to x264_8_ / x264_10_
 * etc., depending on BIT_DEPTH. */
#define ASM_REF JOIN(JOIN(x264_, BIT_DEPTH), _)
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
/*
 * Open an exported x264 assembly function named ASM_REF<name>.
 * Emits the section/alignment/symbol directives, and defines a matching
 * one-shot endfunc_x264 macro that emits the return (jirl via $r1/ra),
 * sets the symbol size, and purges itself so the next function_x264 can
 * define it again.  'align' defaults to DEFAULT_ALIGN (defined elsewhere).
 */
.macro function_x264 name, align=DEFAULT_ALIGN
.macro endfunc_x264
jirl $r0, $r1, 0x0
.size ASM_REF\name, . - ASM_REF\name
.purgem endfunc_x264
.endm
.text ;
.align \align ;
.globl ASM_REF\name ;
.type ASM_REF\name, @function ;
ASM_REF\name: ;
.endm
This diff is collapsed.
This diff is collapsed.
/*****************************************************************************
* mc.h: loongarch motion compensation
*****************************************************************************
* Copyright (C) 2023 x264 project
*
* Authors: Xiwei Gu <guxiwei-hf@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_LOONGARCH_MC_H
#define X264_LOONGARCH_MC_H
#define x264_mc_init_loongarch x264_template(mc_init_loongarch)
void x264_mc_init_loongarch( int cpu, x264_mc_functions_t *pf );
#define x264_pixel_avg_16x16_lsx x264_template(pixel_avg_16x16_lsx)
void x264_pixel_avg_16x16_lsx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_16x8_lsx x264_template(pixel_avg_16x8_lsx)
void x264_pixel_avg_16x8_lsx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_8x16_lsx x264_template(pixel_avg_8x16_lsx)
void x264_pixel_avg_8x16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x8_lsx x264_template(pixel_avg_8x8_lsx)
void x264_pixel_avg_8x8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x4_lsx x264_template(pixel_avg_8x4_lsx)
void x264_pixel_avg_8x4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x16_lsx x264_template(pixel_avg_4x16_lsx)
void x264_pixel_avg_4x16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x8_lsx x264_template(pixel_avg_4x8_lsx)
void x264_pixel_avg_4x8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x4_lsx x264_template(pixel_avg_4x4_lsx)
void x264_pixel_avg_4x4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x2_lsx x264_template(pixel_avg_4x2_lsx)
void x264_pixel_avg_4x2_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg2_w4_lsx x264_template(pixel_avg2_w4_lsx)
void x264_pixel_avg2_w4_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w8_lsx x264_template(pixel_avg2_w8_lsx)
void x264_pixel_avg2_w8_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w16_lsx x264_template(pixel_avg2_w16_lsx)
void x264_pixel_avg2_w16_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w20_lsx x264_template(pixel_avg2_w20_lsx)
void x264_pixel_avg2_w20_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_mc_weight_w20_lsx x264_template(mc_weight_w20_lsx)
void x264_mc_weight_w20_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w20_noden_lsx x264_template(mc_weight_w20_noden_lsx)
void x264_mc_weight_w20_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w16_lsx x264_template(mc_weight_w16_lsx)
void x264_mc_weight_w16_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w16_noden_lsx x264_template(mc_weight_w16_noden_lsx)
void x264_mc_weight_w16_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w8_lsx x264_template(mc_weight_w8_lsx)
void x264_mc_weight_w8_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w8_noden_lsx x264_template(mc_weight_w8_noden_lsx)
void x264_mc_weight_w8_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w4_lsx x264_template(mc_weight_w4_lsx)
void x264_mc_weight_w4_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w4_noden_lsx x264_template(mc_weight_w4_noden_lsx)
void x264_mc_weight_w4_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_copy_w16_lsx x264_template(mc_copy_w16_lsx)
void x264_mc_copy_w16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w8_lsx x264_template(mc_copy_w8_lsx)
void x264_mc_copy_w8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w4_lsx x264_template(mc_copy_w4_lsx)
void x264_mc_copy_w4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_store_interleave_chroma_lsx x264_template(store_interleave_chroma_lsx)
void x264_store_interleave_chroma_lsx( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
#define x264_load_deinterleave_chroma_fenc_lsx x264_template(load_deinterleave_chroma_fenc_lsx)
void x264_load_deinterleave_chroma_fenc_lsx( pixel *dst, pixel *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fdec_lsx x264_template(load_deinterleave_chroma_fdec_lsx)
void x264_load_deinterleave_chroma_fdec_lsx( pixel *dst, pixel *src, intptr_t i_src, int height );
#define x264_plane_copy_interleave_core_lsx x264_template(plane_copy_interleave_core_lsx)
void x264_plane_copy_interleave_core_lsx( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
#define x264_plane_copy_deinterleave_lsx x264_template(plane_copy_deinterleave_lsx)
void x264_plane_copy_deinterleave_lsx( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_lasx x264_template(plane_copy_deinterleave_lasx)
void x264_plane_copy_deinterleave_lasx( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
#define x264_prefetch_fenc_420_lsx x264_template(prefetch_fenc_420_lsx)
void x264_prefetch_fenc_420_lsx( uint8_t *pix_y, intptr_t stride_y,
uint8_t *pix_uv, intptr_t stride_uv,
int32_t mb_x );
#define x264_prefetch_fenc_422_lsx x264_template(prefetch_fenc_422_lsx)
void x264_prefetch_fenc_422_lsx( uint8_t *pix_y, intptr_t stride_y,
uint8_t *pix_uv, intptr_t stride_uv,
int32_t mb_x );
#define x264_prefetch_ref_lsx x264_template(prefetch_ref_lsx)
void x264_prefetch_ref_lsx( uint8_t *pix, intptr_t stride, int32_t parity );
#define x264_memcpy_aligned_lsx x264_template(memcpy_aligned_lsx)
void *x264_memcpy_aligned_lsx( void *dst, const void *src, size_t n );
#define x264_memzero_aligned_lsx x264_template(memzero_aligned_lsx)
void x264_memzero_aligned_lsx( void *p_dst, size_t n );
#define x264_hpel_filter_lsx x264_template(hpel_filter_lsx)
void x264_hpel_filter_lsx( pixel *, pixel *, pixel *, pixel *, intptr_t, int, int, int16_t * );
#define x264_frame_init_lowres_core_lsx x264_template(frame_init_lowres_core_lsx)
void x264_frame_init_lowres_core_lsx( uint8_t *, uint8_t *, uint8_t *, uint8_t *,
uint8_t *, intptr_t, intptr_t, int, int );
#define x264_pixel_avg_16x8_lasx x264_template(pixel_avg_16x8_lasx)
void x264_pixel_avg_16x8_lasx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_8x16_lasx x264_template(pixel_avg_8x16_lasx)
void x264_pixel_avg_8x16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x8_lasx x264_template(pixel_avg_8x8_lasx)
void x264_pixel_avg_8x8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x4_lasx x264_template(pixel_avg_8x4_lasx)
void x264_pixel_avg_8x4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x16_lasx x264_template(pixel_avg_4x16_lasx)
void x264_pixel_avg_4x16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x8_lasx x264_template(pixel_avg_4x8_lasx)
void x264_pixel_avg_4x8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x4_lasx x264_template(pixel_avg_4x4_lasx)
void x264_pixel_avg_4x4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x2_lasx x264_template(pixel_avg_4x2_lasx)
void x264_pixel_avg_4x2_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg2_w4_lasx x264_template(pixel_avg2_w4_lasx)
void x264_pixel_avg2_w4_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w8_lasx x264_template(pixel_avg2_w8_lasx)
void x264_pixel_avg2_w8_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w16_lasx x264_template(pixel_avg2_w16_lasx)
void x264_pixel_avg2_w16_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w20_lasx x264_template(pixel_avg2_w20_lasx)
void x264_pixel_avg2_w20_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_mc_weight_w20_lasx x264_template(mc_weight_w20_lasx)
void x264_mc_weight_w20_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w20_noden_lasx x264_template(mc_weight_w20_noden_lasx)
void x264_mc_weight_w20_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w16_lasx x264_template(mc_weight_w16_lasx)
void x264_mc_weight_w16_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w16_noden_lasx x264_template(mc_weight_w16_noden_lasx)
void x264_mc_weight_w16_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w8_lasx x264_template(mc_weight_w8_lasx)
void x264_mc_weight_w8_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w8_noden_lasx x264_template(mc_weight_w8_noden_lasx)
void x264_mc_weight_w8_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w4_lasx x264_template(mc_weight_w4_lasx)
void x264_mc_weight_w4_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w4_noden_lasx x264_template(mc_weight_w4_noden_lasx)
void x264_mc_weight_w4_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_copy_w16_lasx x264_template(mc_copy_w16_lasx)
void x264_mc_copy_w16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w8_lasx x264_template(mc_copy_w8_lasx)
void x264_mc_copy_w8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w4_lasx x264_template(mc_copy_w4_lasx)
void x264_mc_copy_w4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_plane_copy_interleave_core_lasx x264_template(plane_copy_interleave_core_lasx)
void x264_plane_copy_interleave_core_lasx( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
#define x264_plane_copy_deinterleave_lasx x264_template(plane_copy_deinterleave_lasx)
void x264_plane_copy_deinterleave_lasx( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
#define x264_memzero_aligned_lasx x264_template(memzero_aligned_lasx)
void x264_memzero_aligned_lasx( void *p_dst, size_t n );
#define x264_hpel_filter_lasx x264_template(hpel_filter_lasx)
void x264_hpel_filter_lasx( pixel *, pixel *, pixel *, pixel *, intptr_t, int, int, int16_t * );
#define x264_frame_init_lowres_core_lasx x264_template(frame_init_lowres_core_lasx)
void x264_frame_init_lowres_core_lasx( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *,
intptr_t, intptr_t, int, int );
#endif
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.