Compare revisions

guxiwei · guxiwei · guxiwei · guxiwei · guxiwei · guxiwei
--- a/Makefile
+++ b/Makefile
@@ -229,6 +229,7 @@ ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
 OBJASM += $(SRCASM_X:%.S=%-10.o)
 endif

+OBJCHK += tools/checkasm-loongarch.o
 endif
 endif


--- a/common/loongarch/loongson_asm.S
+++ b/common/loongarch/loongson_asm.S
 /*********************************************************************
- * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Copyright (c) 2022-2024 Loongson Technology Corporation Limited
 * Contributed by Xiwei Gu <guxiwei-hf@loongson.cn>
 *                Shiyou Yin <yinshiyou-hf@loongson.cn>
 *
@@ -31,12 +31,19 @@
 */

 #define LML_VERSION_MAJOR 0
-#define LML_VERSION_MINOR 2
-#define LML_VERSION_MICRO 2
+#define LML_VERSION_MINOR 4
+#define LML_VERSION_MICRO 0

 #define ASM_PREF
 #define DEFAULT_ALIGN    5

+/*
+ *============================================================================
+ * Macros for a specific project; set them as needed.
+ * The following LoongML macros are provided for reference.
+ *============================================================================
+ */
+
 .macro function name, align=DEFAULT_ALIGN
 .macro endfunc
    jirl    $r0, $r1, 0x0
@@ -99,6 +106,39 @@ ASM_PREF\name: ;
 #define sp   $sp
 #define ra   $ra

+#define fa0  $fa0
+#define fa1  $fa1
+#define fa2  $fa2
+#define fa3  $fa3
+#define fa4  $fa4
+#define fa5  $fa5
+#define fa6  $fa6
+#define fa7  $fa7
+#define ft0  $ft0
+#define ft1  $ft1
+#define ft2  $ft2
+#define ft3  $ft3
+#define ft4  $ft4
+#define ft5  $ft5
+#define ft6  $ft6
+#define ft7  $ft7
+#define ft8  $ft8
+#define ft9  $ft9
+#define ft10 $ft10
+#define ft11 $ft11
+#define ft12 $ft12
+#define ft13 $ft13
+#define ft14 $ft14
+#define ft15 $ft15
+#define fs0  $fs0
+#define fs1  $fs1
+#define fs2  $fs2
+#define fs3  $fs3
+#define fs4  $fs4
+#define fs5  $fs5
+#define fs6  $fs6
+#define fs7  $fs7
+
 #define f0  $f0
 #define f1  $f1
 #define f2  $f2
@@ -272,18 +312,17 @@ ASM_PREF\name: ;
 .endm

 /*
- * Description : Range each element of vector
+ * Description : Clamp each element vj[i] to the range vk[i] ~ va[i]
 * clip: vj > vk ? vj : vk && vj < va ? vj : va
- * clip255: vj < 255 ? vj : 255 && vj > 0 ? vj : 0
 */
 .macro vclip.h  vd,  vj, vk, va
    vmax.h    \vd,  \vj,   \vk   // vd = max(vj, vk): apply the lower bound vk
    vmin.h    \vd,  \vd,   \va   // vd = min(vd, va): apply the upper bound va
 .endm

-.macro vclip255.w  vd, vj
-    vmaxi.w   \vd,   \vj,  0
-    vsat.wu   \vd,   \vd,  7
+.macro vclip.w  vd,  vj, vk, va
+    vmax.w    \vd,  \vj,   \vk   // vd = max(vj, vk): apply the lower bound vk
+    vmin.w    \vd,  \vd,   \va   // vd = min(vd, va): apply the upper bound va
 .endm

 .macro xvclip.h  xd,  xj, xk, xa
@@ -291,6 +330,25 @@ ASM_PREF\name: ;
    xvmin.h    \xd,  \xd,   \xa
 .endm

+.macro xvclip.w  xd,  xj, xk, xa
+    xvmax.w    \xd,  \xj,   \xk   // xd = max(xj, xk): apply the lower bound xk
+    xvmin.w    \xd,  \xd,   \xa   // xd = min(xd, xa): apply the upper bound xa
+.endm
+
+/*
+ * Description : Range element vj[i] to 0 ~ 255
+ * clip255: vj < 255 ? vj : 255 && vj > 0 ? vj : 0
+ */
+.macro vclip255.h  vd, vj
+    vmaxi.h   \vd,   \vj,  0     // negative elements become 0
+    vsat.hu   \vd,   \vd,  7     // unsigned saturation to 8 bits: values above 255 become 255
+.endm
+
+.macro vclip255.w  vd, vj
+    vmaxi.w   \vd,   \vj,  0     // negative elements become 0
+    vsat.wu   \vd,   \vd,  7     // unsigned saturation to 8 bits: values above 255 become 255
+.endm
+
 .macro xvclip255.h  xd, xj
    xvmaxi.h   \xd,   \xj,  0
    xvsat.hu   \xd,   \xd,  7

--- a/common/loongarch/pixel-a.S
+++ b/common/loongarch/pixel-a.S
@@ -1438,6 +1438,9 @@ endfunc_x264
 *                           const Pixel *pix2, intptr_t i_pix2)
 */
 function_x264 pixel_sa8d_16x16_lasx
+    addi.d          sp,    sp,   -8
+    fst.d           f24,   sp,   0
+
    slli.d          t2,    a1,   1
    slli.d          t3,    a3,   1
    add.d           t4,    a1,   t2
@@ -1753,6 +1756,9 @@ function_x264 pixel_sa8d_16x16_lasx
    add.d           t4,    t4,   t5
    addi.d          t4,    t4,   2
    srli.d          a0,    t4,   2
+
+    fld.d           f24,   sp,   0
+    addi.d          sp,    sp,   8
 endfunc_x264

 /*

--- a/common/loongarch/quant-a.S
+++ b/common/loongarch/quant-a.S
@@ -984,3 +984,248 @@ function_x264 decimate_score64_lsx
    jirl            $r0,    $r1,   0x0
 .END_SCORE_64_LSX:
 endfunc_x264
+
+/*
+ * int coeff_level_run16( dctcoef *dct, x264_run_level_t *runlevel )
+ *
+ * Scans the 16 coefficients at a0 (dct) from the highest index downwards
+ * and records every non-zero one in the structure at a1 (runlevel):
+ *   - 32-bit word at a1+0 : index of the highest non-zero coefficient
+ *   - 32-bit word at a1+4 : bitmask with bit i set for every stored dct[i]
+ *   - 16-bit values starting at the first 16-byte boundary at or after
+ *     a1+8 : the non-zero coefficients, highest index first
+ * Returns (in a0) the number of coefficients stored.
+ *
+ * The vector prologue builds a 64-bit map in t8 with one 4-bit nibble per
+ * coefficient (most significant nibble = dct[15]); clz.d / 4 then gives
+ * the number of zero coefficients above the next non-zero one.
+ * Coefficients are read as 16-bit values (ldx.h at index * 2).
+ */
+function_x264 coeff_level_run16_lasx
+    addi.w          t0,     zero,  15  // t0 = highest possible index
+
+    xvld            xr0,    a0,    0  // load 32 bytes = 16 coefficients
+    xvldi           xr2,    1  // constant 1 per byte, threshold for the non-zero test below
+
+    xvssrlni.bu.h   xr0,    xr0,   0  // narrow 16-bit -> unsigned byte with saturation (non-zero stays non-zero)
+    xvpermi.d       xr1,    xr0,   0xd8  // bring the 16 narrowed bytes together in index order
+    xvsle.bu        xr3,    xr2,   xr1  // 0xff where 1 <= byte, i.e. coefficient is non-zero
+    xvsrlni.b.h     xr3,    xr3,   4  // pack the byte mask to one nibble per coefficient
+    xvpickve2gr.du  t8,     xr3,   0  // t8 = 64-bit nibble map
+    clz.d           t1,     t8
+
+    srai.w          t1,     t1,    2  // t1 = number of zero coefficients at the top
+    sub.w           t0,     t0,    t1 // Index of the first non-zero element starting from the highest bit
+    st.w            t0,     a1,    0x00 // Store runlevel->last
+    addi.d          t3,     a1,    23
+    nor             t2,     zero,  zero
+    addi.d          t2,     t2,    -15  // t2 = -16 = ~15
+    and             t3,     t3,    t2 // runlevel->level
+    xor             t4,     t4,    t4 // mask
+    xor             t5,     t5,    t5 // total: number of non-zero elements
+    addi.w          t6,     zero,  1  // const 1
+.LOOP_COEFF_LEVEL_RUN16_LASX:
+    slli.w          t7,     t0,    1  // byte offset of dct[t0]
+    ldx.h           t2,     a0,    t7  // t2 = dct[t0]
+    st.h            t2,     t3,    0  // append it to the level array
+    addi.d          t3,     t3,    2
+
+    addi.w          t5,     t5,    1  // total++
+    sll.w           t2,     t6,    t0
+    or              t4,     t4,    t2  // mask |= 1 << t0
+    // NOTE(review): mask <= 0 looks reachable only when t0 was -1 (all-zero
+    // input, shift count wrapping to bit 31) - confirm.
+    bge             zero,   t4,    .END_COEFF_LEVEL_RUN16_LASX
+
+    addi.w          t0,     t0,    -1  // step below the coefficient just stored
+    slli.w          t1,     t1,    2
+    addi.w          t1,     t1,    4  // bits to drop: skipped zero nibbles plus the nibble just consumed
+    sll.d           t8,     t8,    t1  // shift the consumed nibbles out of the map
+    clz.d           t1,     t8
+    srai.w          t1,     t1,    2  // zero coefficients before the next non-zero one
+    sub.w           t0,     t0,    t1 // Index of the first non-zero element starting from the highest bit
+    bge             t0,     zero,  .LOOP_COEFF_LEVEL_RUN16_LASX  // loop while the index is still valid
+.END_COEFF_LEVEL_RUN16_LASX:
+    st.w            t4,     a1,    4  // store the mask word
+    move            a0,     t5  // return total
+endfunc_x264
+
+/*
+ * 15-coefficient variant of coeff_level_run (a0 = dct, a1 = runlevel).
+ * 32 bytes are loaded, but the 16th 16-bit element is cleared before the
+ * scan, so only indices 0..14 can be reported.
+ * Writes the highest non-zero index to a1+0, the index bitmask to a1+4 and
+ * the non-zero coefficients (highest index first) to the first 16-byte
+ * boundary at or after a1+8; returns their count in a0.
+ */
+function_x264 coeff_level_run15_lasx
+    addi.w          t0,     zero,  15
+
+    vld             vr0,    a0,    0  // elements 0..7
+    vld             vr1,    a0,    16  // elements 8..15
+    xvldi           xr3,    1  // constant 1 per byte, threshold for the non-zero test below
+
+    vinsgr2vr.h     vr1,    zero,  7  // clear element 15 so it never counts as non-zero
+    xvpermi.q       xr1,    xr0,   0x20  // merge both 128-bit halves into xr1
+
+    xvssrlni.bu.h   xr1,    xr1,   0  // narrow 16-bit -> unsigned byte with saturation
+    xvpermi.d       xr2,    xr1,   0xd8  // bring the 16 narrowed bytes together in index order
+    xvsle.bu        xr4,    xr3,   xr2  // 0xff where 1 <= byte, i.e. coefficient is non-zero
+    xvsrlni.b.h     xr4,    xr4,   4  // pack the byte mask to one nibble per coefficient
+    xvpickve2gr.du  t8,     xr4,   0  // t8 = 64-bit nibble map, top nibble = element 15
+    clz.d           t1,     t8
+
+    srai.w          t1,     t1,    2  // t1 = number of zero coefficients at the top
+    sub.w           t0,     t0,    t1 // Index of the first non-zero element starting from the highest bit
+    st.w            t0,     a1,    0x00 // Store runlevel->last
+    addi.d          t3,     a1,    23
+    nor             t2,     zero,  zero
+    addi.d          t2,     t2,    -15  // t2 = -16 = ~15
+    and             t3,     t3,    t2 // runlevel->level
+    xor             t4,     t4,    t4 // mask
+    xor             t5,     t5,    t5 // total: number of non-zero elements
+    addi.w          t6,     zero,  1  // const 1
+.LOOP_COEFF_LEVEL_RUN15_LASX:
+    slli.w          t7,     t0,    1  // byte offset of dct[t0]
+    ldx.h           t2,     a0,    t7  // t2 = dct[t0]
+    st.h            t2,     t3,    0  // append it to the level array
+    addi.d          t3,     t3,    2
+
+    addi.w          t5,     t5,    1  // total++
+    sll.w           t2,     t6,    t0
+    or              t4,     t4,    t2  // mask |= 1 << t0
+    // NOTE(review): mask <= 0 looks reachable only when t0 was -1 (all-zero
+    // input, shift count wrapping to bit 31) - confirm.
+    bge             zero,   t4,    .END_COEFF_LEVEL_RUN15_LASX
+
+    addi.w          t0,     t0,    -1  // step below the coefficient just stored
+    slli.w          t1,     t1,    2
+    addi.w          t1,     t1,    4  // bits to drop: skipped zero nibbles plus the nibble just consumed
+    sll.d           t8,     t8,    t1  // shift the consumed nibbles out of the map
+    clz.d           t1,     t8
+    srai.w          t1,     t1,    2  // zero coefficients before the next non-zero one
+    sub.w           t0,     t0,    t1 // Index of the first non-zero element starting from the highest bit
+    bge             t0,     zero,  .LOOP_COEFF_LEVEL_RUN15_LASX  // loop while the index is still valid
+.END_COEFF_LEVEL_RUN15_LASX:
+    st.w            t4,     a1,    4  // store the mask word
+    move            a0,     t5  // return total
+endfunc_x264
+
+/*
+ * 128-bit (LSX) version of the 16-coefficient coeff_level_run
+ * (a0 = dct, a1 = runlevel).
+ * Writes the highest non-zero index to a1+0, the index bitmask to a1+4 and
+ * the non-zero coefficients (highest index first) to the first 16-byte
+ * boundary at or after a1+8; returns their count in a0.
+ * t8 holds a 64-bit map with one nibble per coefficient, scanned with clz.d.
+ */
+function_x264 coeff_level_run16_lsx
+    addi.w          t0,     zero,  15
+    vld             vr0,    a0,    0  // elements 0..7
+    vld             vr1,    a0,    16  // elements 8..15
+    vldi            vr2,    1  // constant 1 per byte, threshold for the non-zero test below
+
+    vssrlni.bu.h    vr0,    vr0,   0  // narrow 16-bit -> unsigned byte with saturation
+    vssrlni.bu.h    vr1,    vr1,   0
+    vpermi.w        vr1,    vr0,   0x44  // vr1 = 16 bytes, one per coefficient, in index order
+    vsle.bu         vr3,    vr2,   vr1  // 0xff where 1 <= byte, i.e. coefficient is non-zero
+    vsrlni.b.h      vr3,    vr3,   4  // pack the byte mask to one nibble per coefficient
+    vpickve2gr.du   t8,     vr3,   0  // t8 = 64-bit nibble map
+    clz.d           t1,     t8
+
+    srai.w          t1,     t1,    2  // t1 = number of zero coefficients at the top
+    sub.w           t0,     t0,    t1 // Index of the first non-zero element starting from the highest bit
+    st.w            t0,     a1,    0x00 // Store runlevel->last
+    addi.d          t3,     a1,    23
+    nor             t2,     zero,  zero
+    addi.d          t2,     t2,    -15  // t2 = -16 = ~15
+    and             t3,     t3,    t2 // runlevel->level
+    xor             t4,     t4,    t4 // mask
+    xor             t5,     t5,    t5 // total: number of non-zero elements
+    addi.w          t6,     zero,  1  // const 1
+.LOOP_COEFF_LEVEL_RUN16_LSX:
+    slli.w          t7,     t0,    1  // byte offset of dct[t0]
+    ldx.h           t2,     a0,    t7  // t2 = dct[t0]
+    st.h            t2,     t3,    0  // append it to the level array
+    addi.d          t3,     t3,    2
+
+    addi.w          t5,     t5,    1  // total++
+    sll.w           t2,     t6,    t0
+    or              t4,     t4,    t2  // mask |= 1 << t0
+    // NOTE(review): mask <= 0 looks reachable only when t0 was -1 (all-zero
+    // input, shift count wrapping to bit 31) - confirm.
+    bge             zero,   t4,    .END_COEFF_LEVEL_RUN16_LSX
+
+    addi.w          t0,     t0,    -1  // step below the coefficient just stored
+    slli.w          t1,     t1,    2
+    addi.w          t1,     t1,    4  // bits to drop: skipped zero nibbles plus the nibble just consumed
+    sll.d           t8,     t8,    t1  // shift the consumed nibbles out of the map
+    clz.d           t1,     t8
+    srai.w          t1,     t1,    2  // zero coefficients before the next non-zero one
+    sub.w           t0,     t0,    t1 // Index of the first non-zero element starting from the highest bit
+    bge             t0,     zero,  .LOOP_COEFF_LEVEL_RUN16_LSX  // loop while the index is still valid
+.END_COEFF_LEVEL_RUN16_LSX:
+    st.w            t4,     a1,    4  // store the mask word
+    move            a0,     t5  // return total
+endfunc_x264
+
+/*
+ * 128-bit (LSX) version of the 15-coefficient coeff_level_run
+ * (a0 = dct, a1 = runlevel).
+ * 32 bytes are loaded, but the 16th 16-bit element is cleared before the
+ * scan, so only indices 0..14 can be reported.
+ * Writes the highest non-zero index to a1+0, the index bitmask to a1+4 and
+ * the non-zero coefficients (highest index first) to the first 16-byte
+ * boundary at or after a1+8; returns their count in a0.
+ */
+function_x264 coeff_level_run15_lsx
+    addi.w          t0,     zero,  15
+    vld             vr0,    a0,    0  // elements 0..7
+    vld             vr1,    a0,    16  // elements 8..15
+    vldi            vr2,    1  // constant 1 per byte, threshold for the non-zero test below
+    vinsgr2vr.h     vr1,    zero,  7  // clear element 15 so it never counts as non-zero
+
+    vssrlni.bu.h    vr0,    vr0,   0  // narrow 16-bit -> unsigned byte with saturation
+    vssrlni.bu.h    vr1,    vr1,   0
+    vpermi.w        vr1,    vr0,   0x44  // vr1 = 16 bytes, one per coefficient, in index order
+    vsle.bu         vr3,    vr2,   vr1  // 0xff where 1 <= byte, i.e. coefficient is non-zero
+    vsrlni.b.h      vr3,    vr3,   4  // pack the byte mask to one nibble per coefficient
+    vpickve2gr.du   t8,     vr3,   0  // t8 = 64-bit nibble map
+    clz.d           t1,     t8
+
+    srai.w          t1,     t1,    2  // t1 = number of zero coefficients at the top
+    sub.w           t0,     t0,    t1 // Index of the first non-zero element starting from the highest bit
+    st.w            t0,     a1,    0x00 // Store runlevel->last
+    addi.d          t3,     a1,    23
+    nor             t2,     zero,  zero
+    addi.d          t2,     t2,    -15  // t2 = -16 = ~15
+    and             t3,     t3,    t2 // runlevel->level
+    xor             t4,     t4,    t4 // mask
+    xor             t5,     t5,    t5 // total: number of non-zero elements
+    addi.w          t6,     zero,  1  // const 1
+.LOOP_COEFF_LEVEL_RUN15_LSX:
+    slli.w          t7,     t0,    1  // byte offset of dct[t0]
+    ldx.h           t2,     a0,    t7  // t2 = dct[t0]
+    st.h            t2,     t3,    0  // append it to the level array
+    addi.d          t3,     t3,    2
+
+    addi.w          t5,     t5,    1  // total++
+    sll.w           t2,     t6,    t0
+    or              t4,     t4,    t2  // mask |= 1 << t0
+    // NOTE(review): mask <= 0 looks reachable only when t0 was -1 (all-zero
+    // input, shift count wrapping to bit 31) - confirm.
+    bge             zero,   t4,    .END_COEFF_LEVEL_RUN15_LSX
+
+    addi.w          t0,     t0,    -1  // step below the coefficient just stored
+    slli.w          t1,     t1,    2
+    addi.w          t1,     t1,    4  // bits to drop: skipped zero nibbles plus the nibble just consumed
+    sll.d           t8,     t8,    t1  // shift the consumed nibbles out of the map
+    clz.d           t1,     t8
+    srai.w          t1,     t1,    2  // zero coefficients before the next non-zero one
+    sub.w           t0,     t0,    t1 // Index of the first non-zero element starting from the highest bit
+    bge             t0,     zero,  .LOOP_COEFF_LEVEL_RUN15_LSX  // loop while the index is still valid
+.END_COEFF_LEVEL_RUN15_LSX:
+    st.w            t4,     a1,    4  // store the mask word
+    move            a0,     t5  // return total
+endfunc_x264
+
+/*
+ * 8-coefficient coeff_level_run (a0 = dct, a1 = runlevel), LSX version.
+ * Only 16 bytes (8 coefficients) are loaded; the upper half of the nibble
+ * map comes from the zeroed vr1, so the scan that starts at index 15
+ * always lands on index 7 or below.
+ * Writes the highest non-zero index to a1+0, the index bitmask to a1+4 and
+ * the non-zero coefficients (highest index first) to the first 16-byte
+ * boundary at or after a1+8; returns their count in a0.
+ */
+function_x264 coeff_level_run8_lsx
+    addi.w          t0,     zero,  15
+    vld             vr0,    a0,    0  // elements 0..7
+    vxor.v          vr1,    vr1,   vr1  // vr1 = 0: stands in for the missing elements 8..15
+    vldi            vr2,    1  // constant 1 per byte, threshold for the non-zero test below
+
+    vssrlni.bu.h    vr0,    vr0,   0  // narrow 16-bit -> unsigned byte with saturation
+    vpermi.w        vr1,    vr0,   0x44  // vr1 = 8 coefficient bytes followed by 8 zero bytes
+    vsle.bu         vr3,    vr2,   vr1  // 0xff where 1 <= byte, i.e. coefficient is non-zero
+    vsrlni.b.h      vr3,    vr3,   4  // pack the byte mask to one nibble per coefficient
+    vpickve2gr.du   t8,     vr3,   0  // t8 = 64-bit nibble map (upper 32 bits are zero)
+    clz.d           t1,     t8
+
+    srai.w          t1,     t1,    2  // t1 = number of zero nibbles at the top (at least 8)
+    sub.w           t0,     t0,    t1 // Index of the first non-zero element starting from the highest bit
+    st.w            t0,     a1,    0x00 // Store runlevel->last
+    addi.d          t3,     a1,    23
+    nor             t2,     zero,  zero
+    addi.d          t2,     t2,    -15  // t2 = -16 = ~15
+    and             t3,     t3,    t2 // runlevel->level
+    xor             t4,     t4,    t4 // mask
+    xor             t5,     t5,    t5 // total: number of non-zero elements
+    addi.w          t6,     zero,  1  // const 1
+.LOOP_COEFF_LEVEL_RUN8_LSX:
+    slli.w          t7,     t0,    1  // byte offset of dct[t0]
+    ldx.h           t2,     a0,    t7  // t2 = dct[t0]
+    st.h            t2,     t3,    0  // append it to the level array
+    addi.d          t3,     t3,    2
+
+    addi.w          t5,     t5,    1  // total++
+    sll.w           t2,     t6,    t0
+    or              t4,     t4,    t2  // mask |= 1 << t0
+    // NOTE(review): mask <= 0 looks reachable only when t0 was -1 (all-zero
+    // input, shift count wrapping to bit 31) - confirm.
+    bge             zero,   t4,    .END_COEFF_LEVEL_RUN8_LSX
+
+    addi.w          t0,     t0,    -1  // step below the coefficient just stored
+    slli.w          t1,     t1,    2
+    addi.w          t1,     t1,    4  // bits to drop: skipped zero nibbles plus the nibble just consumed
+    sll.d           t8,     t8,    t1  // shift the consumed nibbles out of the map
+    clz.d           t1,     t8
+    srai.w          t1,     t1,    2  // zero coefficients before the next non-zero one
+    sub.w           t0,     t0,    t1 // Index of the first non-zero element starting from the highest bit
+    bge             t0,     zero,  .LOOP_COEFF_LEVEL_RUN8_LSX  // loop while the index is still valid
+.END_COEFF_LEVEL_RUN8_LSX:
+    st.w            t4,     a1,    4  // store the mask word
+    move            a0,     t5  // return total
+endfunc_x264
--- a/common/loongarch/quant.h
+++ b/common/loongarch/quant.h
@@ -81,4 +81,16 @@ void x264_dequant_8x8_lasx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
 #define x264_dequant_4x4_dc_lasx x264_template(dequant_4x4_dc_lasx)
 void x264_dequant_4x4_dc_lasx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );

+#define x264_coeff_level_run16_lasx x264_template(coeff_level_run16_lasx)
+int x264_coeff_level_run16_lasx( dctcoef *, x264_run_level_t * );
+#define x264_coeff_level_run15_lasx x264_template(coeff_level_run15_lasx)
+int x264_coeff_level_run15_lasx( dctcoef *, x264_run_level_t * );
+
+#define x264_coeff_level_run16_lsx x264_template(coeff_level_run16_lsx)
+int x264_coeff_level_run16_lsx( dctcoef *, x264_run_level_t * );
+#define x264_coeff_level_run15_lsx x264_template(coeff_level_run15_lsx)
+int x264_coeff_level_run15_lsx( dctcoef *, x264_run_level_t * );
+#define x264_coeff_level_run8_lsx x264_template(coeff_level_run8_lsx)
+int x264_coeff_level_run8_lsx( dctcoef *, x264_run_level_t * );
+
 #endif/* X264_LOONGARCH_QUANT_H */
--- a/common/quant.c
+++ b/common/quant.c
@@ -848,11 +848,17 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf )
        pf->dequant_4x4    = x264_dequant_4x4_lsx;
        pf->dequant_8x8    = x264_dequant_8x8_lsx;
        pf->dequant_4x4_dc = x264_dequant_4x4_dc_lsx;
-        pf->coeff_last4    = x264_coeff_last4_lsx;
-        pf->coeff_last8    = x264_coeff_last8_lsx;
+        pf->decimate_score15 = x264_decimate_score15_lsx;
+        pf->decimate_score16 = x264_decimate_score16_lsx;
+        pf->decimate_score64 = x264_decimate_score64_lsx;
+        pf->coeff_last4              = x264_coeff_last4_lsx;
+        pf->coeff_last8              = x264_coeff_last8_lsx;
        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lsx;
        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lsx;
        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lsx;
+        pf->coeff_level_run8         = x264_coeff_level_run8_lsx;
+        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lsx;
+        pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lsx;
    }
    if( cpu&X264_CPU_LASX )
    {
@@ -863,6 +869,8 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf )
        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lasx;
        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lasx;
        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lasx;
+        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lasx;
+        pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lasx;
    }
 #endif


--- a/tools/checkasm-loongarch.S
+++ b/tools/checkasm-loongarch.S
+/****************************************************************************
+ * checkasm-loongarch.S: assembly check tool
+ *****************************************************************************
+ * Copyright (C) 2024 x264 project
+ *
+ * Authors: Xiwei Gu <guxiwei-hf@loongson.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "../common/loongarch/loongson_asm.S"
+
+// 17 arbitrary 64-bit patterns, one per register that x264_checkasm_call
+// preloads before the call and compares afterwards: the first 9 entries go
+// to s0-s8 (offsets 0..64), the remaining 8 to fs0-fs7 (offsets 72..128).
+const register_init, align=3
+.quad 0x21f86d66c8ca00ce
+.quad 0x75b6ba21077c48ad
+.quad 0xed56bb2dcb3c7736
+.quad 0x8bda43d3fd1a7e06
+.quad 0xb64a9c9e5d318408
+.quad 0xdf9a54b303f1d3a3
+.quad 0x4a75479abd64e097
+.quad 0x249214109d5d1c88
+.quad 0x1a1b2550a612b48c
+.quad 0x79445c159ce79064
+.quad 0x2eed899d5a28ddcd
+.quad 0x86b2536fcd8cf636
+.quad 0xb0856806085e7943
+.quad 0x3f2bf84fc0fcca4e
+.quad 0xacbd382dcf5b8de2
+.quad 0xd229e1f5b281303f
+.quad 0x71aeaff20b095fd9
+endconst
+
+// Text passed to puts by x264_checkasm_call when a preloaded register
+// no longer holds its register_init pattern after the call.
+const error_message
+.asciz "failed to preserve register"
+endconst
+
+.text
+
+// max number of args used by any x264 asm function.
+#define MAX_ARGS 15
+
+#define CLOBBER_STACK ((8*MAX_ARGS + 15) & ~15)
+
+// Fill dirty data at stack space
+// C prototype: void x264_checkasm_stack_clobber( uint64_t clobber, ... )
+// Writes a0 and a1 alternately over CLOBBER_STACK bytes, 16 bytes per
+// iteration, starting with the doubleword at the entry sp and moving
+// downwards. sp is only used as the write cursor and is restored from t0
+// before returning.
+function x264_checkasm_stack_clobber
+    move     t0,  sp                 // remember the entry sp
+    addi.d   t1,  zero, CLOBBER_STACK // t1 = bytes still to overwrite
+1:
+    st.d     a0,  sp,  0x00
+    st.d     a1,  sp, -0x08
+    addi.d   sp,  sp, -0x10          // move the cursor down by 16 bytes
+    addi.d   t1,  t1, -0x10
+    blt      zero,t1,  1b            // loop while t1 > 0
+    move     sp,  t0                 // restore the entry sp
+endfunc
+
+#define ARG_STACK ((8*(MAX_ARGS - 8) + 15) & ~15)
+
+/*
+ * intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
+ *
+ * Calls func with the arguments found on the caller's stack and verifies
+ * that it preserves s0-s8 and fs0-fs7.
+ *   a0 = func, a1 = ok.  The arguments for func are read from the stack
+ *   starting at the entry sp: the first 8 doublewords are loaded into
+ *   a0-a7, the following MAX_ARGS - 8 doublewords are copied to a fresh
+ *   outgoing argument area.
+ * Before the call s0-s8 / fs0-fs7 are saved and loaded with the
+ * register_init patterns; afterwards they are compared against the table.
+ * On any mismatch *ok is set to 0 and error_message is printed with puts.
+ * The value func returned in a0 is passed through to the caller.
+ *
+ * Stack layout: 144-byte save area (17 registers = 136 bytes, padded),
+ * 16 bytes for ok/ra, ARG_STACK bytes of outgoing arguments.  Every
+ * adjustment is a multiple of 16, so a 16-byte aligned entry sp stays
+ * aligned at both calls.
+ */
+function x264_checkasm_call
+    // Saved s0 - s8, fs0 - fs7
+    move     t4,  sp                 // t4 = entry sp, base of func's arguments
+    addi.d   sp,  sp, -144           // 17 * 8 = 136 bytes, padded to keep sp 16-byte aligned
+    st.d     s0,  sp, 0
+    st.d     s1,  sp, 8
+    st.d     s2,  sp, 16
+    st.d     s3,  sp, 24
+    st.d     s4,  sp, 32
+    st.d     s5,  sp, 40
+    st.d     s6,  sp, 48
+    st.d     s7,  sp, 56
+    st.d     s8,  sp, 64
+    fst.d    fs0, sp, 72
+    fst.d    fs1, sp, 80
+    fst.d    fs2, sp, 88
+    fst.d    fs3, sp, 96
+    fst.d    fs4, sp, 104
+    fst.d    fs5, sp, 112
+    fst.d    fs6, sp, 120
+    fst.d    fs7, sp, 128
+
+    // Preload the registers under test with known patterns
+    la.local t1,  register_init
+    ld.d     s0,  t1, 0
+    ld.d     s1,  t1, 8
+    ld.d     s2,  t1, 16
+    ld.d     s3,  t1, 24
+    ld.d     s4,  t1, 32
+    ld.d     s5,  t1, 40
+    ld.d     s6,  t1, 48
+    ld.d     s7,  t1, 56
+    ld.d     s8,  t1, 64
+    fld.d    fs0, t1, 72
+    fld.d    fs1, t1, 80
+    fld.d    fs2, t1, 88
+    fld.d    fs3, t1, 96
+    fld.d    fs4, t1, 104
+    fld.d    fs5, t1, 112
+    fld.d    fs6, t1, 120
+    fld.d    fs7, t1, 128
+
+    addi.d   sp,  sp, -16
+    st.d     a1,  sp, 0 // ok
+    st.d     ra,  sp, 8 // Ret address
+
+    addi.d   sp,  sp, -ARG_STACK
+
+    addi.d   t0,  zero, 8*8          // source offset: skip the 8 register arguments
+    xor      t1,  t1, t1             // destination offset in the new argument area
+.rept MAX_ARGS - 8
+    // Skip the first 8 args, that are loaded into registers
+    ldx.d    t2,  t4, t0
+    stx.d    t2,  sp, t1
+    addi.d   t0,  t0, 8
+    addi.d   t1,  t1, 8
+.endr
+    move     t3,  a0 // Func
+    ld.d     a0,  t4, 0
+    ld.d     a1,  t4, 8
+    ld.d     a2,  t4, 16
+    ld.d     a3,  t4, 24
+    ld.d     a4,  t4, 32
+    ld.d     a5,  t4, 40
+    ld.d     a6,  t4, 48
+    ld.d     a7,  t4, 56
+
+    jirl     ra,  t3, 0              // call func; its result stays in a0
+
+    addi.d   sp,  sp, ARG_STACK
+    ld.d     t2,  sp, 0 // ok
+    ld.d     ra,  sp, 8 // Ret address
+    addi.d   sp,  sp, 16
+
+    la.local t1,  register_init
+    xor      t3,  t3, t3             // t3 accumulates every differing bit
+
+.macro check_reg_gr reg1
+    ld.d     t0,  t1, 0
+    xor      t0,  $s\reg1, t0
+    or       t3,  t3, t0
+    addi.d   t1,  t1, 8
+.endm
+    check_reg_gr 0
+    check_reg_gr 1
+    check_reg_gr 2
+    check_reg_gr 3
+    check_reg_gr 4
+    check_reg_gr 5
+    check_reg_gr 6
+    check_reg_gr 7
+    check_reg_gr 8
+
+.macro check_reg_fr reg1
+    ld.d     t0,  t1, 0
+    movfr2gr.d t4,$fs\reg1
+    xor      t0,  t0, t4
+    or       t3,  t3, t0
+    addi.d   t1,  t1, 8
+.endm
+    check_reg_fr 0
+    check_reg_fr 1
+    check_reg_fr 2
+    check_reg_fr 3
+    check_reg_fr 4
+    check_reg_fr 5
+    check_reg_fr 6
+    check_reg_fr 7
+
+    beqz     t3,  0f                 // all registers intact: skip the report
+
+    st.w     zero,t2, 0x00 // Set OK to 0 (ok points to an int: 32-bit store only)
+    addi.d   sp,  sp, -16            // 16 bytes keep sp aligned for the call to puts
+    st.d     ra,  sp, 0
+    st.d     a0,  sp, 8              // keep func's return value across puts
+    la.local a0,  error_message
+    bl       puts
+    ld.d     ra,  sp, 0
+    ld.d     a0,  sp, 8
+    addi.d   sp,  sp, 16
+0:
+    ld.d     s0,  sp, 0
+    ld.d     s1,  sp, 8
+    ld.d     s2,  sp, 16
+    ld.d     s3,  sp, 24
+    ld.d     s4,  sp, 32
+    ld.d     s5,  sp, 40
+    ld.d     s6,  sp, 48
+    ld.d     s7,  sp, 56
+    ld.d     s8,  sp, 64
+    fld.d    fs0, sp, 72
+    fld.d    fs1, sp, 80
+    fld.d    fs2, sp, 88
+    fld.d    fs3, sp, 96
+    fld.d    fs4, sp, 104
+    fld.d    fs5, sp, 112
+    fld.d    fs6, sp, 120
+    fld.d    fs7, sp, 128
+    addi.d   sp,  sp, 144
+endfunc
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -274,6 +274,10 @@ intptr_t x264_checkasm_call_noneon( intptr_t (*func)(), int *ok, ... );
 intptr_t (*x264_checkasm_call)( intptr_t (*func)(), int *ok, ... ) = x264_checkasm_call_noneon;
 #endif

+#if ARCH_LOONGARCH
+intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
+#endif
+
 #define call_c1(func,...) func(__VA_ARGS__)

 #if HAVE_MMX && ARCH_X86_64
@@ -300,6 +304,12 @@ void x264_checkasm_stack_clobber( uint64_t clobber, ... );
    x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, 0, 0, __VA_ARGS__ ); })
 #elif HAVE_MMX || HAVE_ARMV6
 #define call_a1(func,...) x264_checkasm_call( (intptr_t(*)())func, &ok, __VA_ARGS__ )
+#elif ARCH_LOONGARCH && HAVE_LSX
+/* Implemented in tools/checkasm-loongarch.S. */
+void x264_checkasm_stack_clobber( uint64_t clobber, ... );
+/* LoongArch call wrapper: first overwrite the stack with a random 16-bit
+ * value replicated into all four 16-bit lanes of r, then run func through
+ * x264_checkasm_call.  The six literal zeros follow func and &ok;
+ * presumably they fill the remaining argument registers so that the real
+ * arguments end up on the stack, where x264_checkasm_call reads them. */
+#define call_a1(func,...) ({ \
+    uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
+    x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+8 */ \
+    x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, 0, 0, __VA_ARGS__ ); })
 #else
 #define call_a1 call_c1
 #endif
No results found