VideoLAN / x264

Commit 57618eea
Authored Jun 18, 2015 by Rishikesh More; committed by Henrik Gramner, Jul 25, 2015

mips: MSA dct optimizations

Signed-off-by: Rishikesh More <rishikesh.more@imgtec.com>

parent 4ebb23aa
Changes 4
Makefile

@@ -146,7 +146,7 @@ endif
 # MSA optims
 ifeq ($(SYS_ARCH),MIPS)
 ifneq ($(findstring HAVE_MSA 1, $(CONFIG)),)
-SRCS += common/mips/mc-c.c
+SRCS += common/mips/mc-c.c common/mips/dct-c.c
 endif
 endif
common/dct.c

@@ -38,6 +38,9 @@
 #if ARCH_AARCH64
 #   include "aarch64/dct.h"
 #endif
+#if ARCH_MIPS
+#   include "mips/dct.h"
+#endif
 
 /* the inverse of the scaling factors introduced by 8x8 fdct */
 /* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */

@@ -752,6 +755,27 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
 #endif
     }
 #endif
+
+#if HAVE_MSA
+    if( cpu&X264_CPU_MSA )
+    {
+        dctf->sub4x4_dct       = x264_sub4x4_dct_msa;
+        dctf->sub8x8_dct       = x264_sub8x8_dct_msa;
+        dctf->sub16x16_dct     = x264_sub16x16_dct_msa;
+        dctf->sub8x8_dct_dc    = x264_sub8x8_dct_dc_msa;
+        dctf->sub8x16_dct_dc   = x264_sub8x16_dct_dc_msa;
+        dctf->dct4x4dc         = x264_dct4x4dc_msa;
+        dctf->idct4x4dc        = x264_idct4x4dc_msa;
+        dctf->add4x4_idct      = x264_add4x4_idct_msa;
+        dctf->add8x8_idct      = x264_add8x8_idct_msa;
+        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_msa;
+        dctf->add16x16_idct    = x264_add16x16_idct_msa;
+        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa;
+        dctf->add8x8_idct8     = x264_add8x8_idct8_msa;
+        dctf->add16x16_idct8   = x264_add16x16_idct8_msa;
+    }
+#endif
+
 #endif // HIGH_BIT_DEPTH
 }

@@ -1072,4 +1096,12 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
     }
 #endif // ARCH_AARCH64
 #endif // !HIGH_BIT_DEPTH
+
+#if !HIGH_BIT_DEPTH
+#if HAVE_MSA
+    if( cpu&X264_CPU_MSA )
+    {
+        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa;
+    }
+#endif
+#endif
 }
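
For context: x264 reaches these transforms through the x264_dct_function_t table that x264_dct_init() fills above, so the MSA routines are only ever called through function pointers once the CPU flags include X264_CPU_MSA. A minimal usage sketch (illustrative only, not part of this commit; the buffer names dct4x4, p_fenc and p_fdec are hypothetical):

    x264_dct_function_t dctf;
    x264_dct_init( cpu, &dctf );               /* selects the *_msa versions when cpu & X264_CPU_MSA */
    dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec ); /* resolves to x264_sub4x4_dct_msa on MSA-capable MIPS */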
common/mips/dct-c.c
0 → 100644

/*****************************************************************************
* dct-c.c: msa transform and zigzag
*****************************************************************************
* Copyright (C) 2015 x264 project
*
* Authors: Rishikesh More <rishikesh.more@imgtec.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "macros.h"
#if !HIGH_BIT_DEPTH
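
/* One horizontal pass of the H.264 4x4 inverse transform on four vector rows:
 * the a+c / a-c / (b>>1)-d / b+(d>>1) stage followed by a butterfly. */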
#define AVC_ITRANS_H( in0, in1, in2, in3, out0, out1, out2, out3 )          \
{                                                                           \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
                                                                            \
    tmp0_m = in0 + in2;                                                     \
    tmp1_m = in0 - in2;                                                     \
    tmp2_m = in1 >> 1;                                                      \
    tmp2_m = tmp2_m - in3;                                                  \
    tmp3_m = in3 >> 1;                                                      \
    tmp3_m = in1 + tmp3_m;                                                  \
                                                                            \
    BUTTERFLY_4( tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3 );  \
}
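
/* Forward 4x4 transform of DC coefficients: horizontal and vertical Hadamard
 * butterflies, with the result rounded by a shift of 1. */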
static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst,
                              int32_t i_src_stride )
{
    v8i16 src0, src1, src2, src3, ver_res0, ver_res1, ver_res2, ver_res3;
    v4i32 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
    v4i32 hor_res0, hor_res1, hor_res2, hor_res3;
    v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r;

    LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
    UNPCK_R_SH_SW( src0, src0_r );
    UNPCK_R_SH_SW( src1, src1_r );
    UNPCK_R_SH_SW( src2, src2_r );
    UNPCK_R_SH_SW( src3, src3_r );
    BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, tmp0, tmp3, tmp2, tmp1 );
    BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
                 hor_res0, hor_res3, hor_res2, hor_res1 );
    TRANSPOSE4x4_SW_SW( hor_res0, hor_res1, hor_res2, hor_res3,
                        hor_res0, hor_res1, hor_res2, hor_res3 );
    BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1,
                 tmp0, tmp3, tmp2, tmp1 );
    BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
                 ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r );
    SRARI_W4_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 );
    PCKEV_H4_SH( ver_res0_r, ver_res0_r, ver_res1_r, ver_res1_r,
                 ver_res2_r, ver_res2_r, ver_res3_r, ver_res3_r,
                 ver_res0, ver_res1, ver_res2, ver_res3 );
    PCKOD_D2_SH( ver_res1, ver_res0, ver_res3, ver_res2, ver_res0, ver_res2 );
    ST_SH2( ver_res0, ver_res2, p_dst, 8 );
}
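
/* Residual (src - ref) of a 4x4 block followed by the H.264 forward 4x4
 * transform: row pass, transpose, then column pass. */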
static void avc_sub4x4_dct_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref, int32_t i_dst_stride,
                                int16_t *p_dst )
{
    uint32_t i_src0, i_src1, i_src2, i_src3;
    uint32_t i_ref0, i_ref1, i_ref2, i_ref3;
    v16i8 src = { 0 };
    v16i8 ref = { 0 };
    v16u8 inp0, inp1;
    v8i16 diff0, diff1, diff2, diff3;
    v8i16 temp0, temp1, temp2, temp3;

    LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 );
    LW4( p_ref, i_dst_stride, i_ref0, i_ref1, i_ref2, i_ref3 );
    INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src );
    INSERT_W4_SB( i_ref0, i_ref1, i_ref2, i_ref3, ref );
    ILVRL_B2_UB( src, ref, inp0, inp1 );
    HSUB_UB2_SH( inp0, inp1, diff0, diff2 );

    diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 );
    diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 );

    BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 );

    diff0 = temp0 + temp1;
    diff1 = ( temp3 << 1 ) + temp2;
    diff2 = temp0 - temp1;
    diff3 = temp3 - ( temp2 << 1 );

    TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3,
                        temp0, temp1, temp2, temp3 );
    BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 );

    temp0 = diff0 + diff1;
    temp1 = ( diff3 << 1 ) + diff2;
    temp2 = diff0 - diff1;
    temp3 = diff3 - ( diff2 << 1 );

    ILVR_D2_UB( temp1, temp0, temp3, temp2, inp0, inp1 );
    ST_UB2( inp0, inp1, p_dst, 8 );
}
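
/* Frame (progressive) zigzag scan of 16 coefficients done with two vector shuffles. */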
static void avc_zigzag_scan_4x4_frame_msa( int16_t pi_dct[16],
                                           int16_t pi_level[16] )
{
    v8i16 src0, src1;
    v8i16 mask0 = { 0, 4, 1, 2, 5, 8, 12, 9 };
    v8i16 mask1 = { 6, 3, 7, 10, 13, 14, 11, 15 };

    LD_SH2( pi_dct, 8, src0, src1 );
    VSHF_H2_SH( src0, src1, src0, src1, mask0, mask1, mask0, mask1 );
    ST_SH2( mask0, mask1, pi_level, 8 );
}
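
/* 4x4 inverse transform, rounding shift by 6, then add to the predicted block
 * with clipping to [0,255]; the coefficient block is cleared afterwards. */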
static void avc_idct4x4_addblk_msa( uint8_t *p_dst, int16_t *p_src,
                                    int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3;
    v8i16 hres0, hres1, hres2, hres3;
    v8i16 vres0, vres1, vres2, vres3;
    v8i16 zeros = { 0 };

    LD4x4_SH( p_src, src0, src1, src2, src3 );
    AVC_ITRANS_H( src0, src1, src2, src3, hres0, hres1, hres2, hres3 );
    TRANSPOSE4x4_SH_SH( hres0, hres1, hres2, hres3,
                        hres0, hres1, hres2, hres3 );
    AVC_ITRANS_H( hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3 );
    SRARI_H4_SH( vres0, vres1, vres2, vres3, 6 );
    ADDBLK_ST4x4_UB( vres0, vres1, vres2, vres3, p_dst, i_dst_stride );
    ST_SH2( zeros, zeros, p_src, 8 );
}
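
/* DC-only 4x4 inverse transform: ( dc + 32 ) >> 6 is added to the 4x4
 * predicted block with clipping to [0,255]. */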
static void avc_idct4x4_addblk_dc_msa( uint8_t *p_dst, int16_t *p_src,
                                       int32_t i_dst_stride )
{
    int16_t i_dc;
    uint32_t i_src0, i_src1, i_src2, i_src3;
    v16u8 pred = { 0 };
    v16i8 out;
    v8i16 input_dc, pred_r, pred_l;

    i_dc = ( p_src[0] + 32 ) >> 6;
    input_dc = __msa_fill_h( i_dc );
    p_src[0] = 0;

    LW4( p_dst, i_dst_stride, i_src0, i_src1, i_src2, i_src3 );
    INSERT_W4_UB( i_src0, i_src1, i_src2, i_src3, pred );
    UNPCK_UB_SH( pred, pred_r, pred_l );

    pred_r += input_dc;
    pred_l += input_dc;
    CLIP_SH2_0_255( pred_r, pred_l );
    out = __msa_pckev_b( ( v16i8 ) pred_l, ( v16i8 ) pred_r );
    ST4x4_UB( out, out, 0, 1, 2, 3, p_dst, i_dst_stride );
}
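
/* 8x8 inverse transform (High profile): row pass, transpose, column pass in
 * 32-bit precision, rounding shift by 6, then add to the destination block
 * with clipping to [0,255]. */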
static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src,
                                  int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
    v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
    v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
    v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
    v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 zeros = { 0 };

    p_src[0] += 32;

    LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );

    vec0 = src0 + src4;
    vec1 = src0 - src4;
    vec2 = src2 >> 1;
    vec2 = vec2 - src6;
    vec3 = src6 >> 1;
    vec3 = src2 + vec3;

    BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 );

    vec0 = src7 >> 1;
    vec0 = src5 - vec0 - src3 - src7;
    vec1 = src3 >> 1;
    vec1 = src1 - vec1 + src7 - src3;
    vec2 = src5 >> 1;
    vec2 = vec2 - src1 + src7 + src5;
    vec3 = src1 >> 1;
    vec3 = vec3 + src3 + src5 + src1;
    tmp4 = vec3 >> 2;
    tmp4 += vec0;
    tmp5 = vec2 >> 2;
    tmp5 += vec1;
    tmp6 = vec1 >> 2;
    tmp6 -= vec2;
    tmp7 = vec0 >> 2;
    tmp7 = vec3 - tmp7;

    BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                 res0, res1, res2, res3, res4, res5, res6, res7 );
    TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7,
                        res0, res1, res2, res3, res4, res5, res6, res7 );
    UNPCK_SH_SW( res0, tmp0_r, tmp0_l );
    UNPCK_SH_SW( res1, tmp1_r, tmp1_l );
    UNPCK_SH_SW( res2, tmp2_r, tmp2_l );
    UNPCK_SH_SW( res3, tmp3_r, tmp3_l );
    UNPCK_SH_SW( res4, tmp4_r, tmp4_l );
    UNPCK_SH_SW( res5, tmp5_r, tmp5_l );
    UNPCK_SH_SW( res6, tmp6_r, tmp6_l );
    UNPCK_SH_SW( res7, tmp7_r, tmp7_l );
    BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r,
                 vec0_r, vec0_l, vec1_l, vec1_r );

    vec2_r = tmp2_r >> 1;
    vec2_l = tmp2_l >> 1;
    vec2_r -= tmp6_r;
    vec2_l -= tmp6_l;
    vec3_r = tmp6_r >> 1;
    vec3_l = tmp6_l >> 1;
    vec3_r += tmp2_r;
    vec3_l += tmp2_l;

    BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r,
                 tmp0_r, tmp2_r, tmp4_r, tmp6_r );
    BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l,
                 tmp0_l, tmp2_l, tmp4_l, tmp6_l );

    vec0_r = tmp7_r >> 1;
    vec0_l = tmp7_l >> 1;
    vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
    vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
    vec1_r = tmp3_r >> 1;
    vec1_l = tmp3_l >> 1;
    vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
    vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
    vec2_r = tmp5_r >> 1;
    vec2_l = tmp5_l >> 1;
    vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
    vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
    vec3_r = tmp1_r >> 1;
    vec3_l = tmp1_l >> 1;
    vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
    vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
    tmp1_r = vec3_r >> 2;
    tmp1_l = vec3_l >> 2;
    tmp1_r += vec0_r;
    tmp1_l += vec0_l;
    tmp3_r = vec2_r >> 2;
    tmp3_l = vec2_l >> 2;
    tmp3_r += vec1_r;
    tmp3_l += vec1_l;
    tmp5_r = vec1_r >> 2;
    tmp5_l = vec1_l >> 2;
    tmp5_r -= vec2_r;
    tmp5_l -= vec2_l;
    tmp7_r = vec0_r >> 2;
    tmp7_l = vec0_l >> 2;
    tmp7_r = vec3_r - tmp7_r;
    tmp7_l = vec3_l - tmp7_l;

    BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r,
                 res0_r, res0_l, res7_l, res7_r );
    BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r,
                 res1_r, res1_l, res6_l, res6_r );
    BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r,
                 res2_r, res2_l, res5_l, res5_r );
    BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r,
                 res3_r, res3_l, res4_l, res4_r );
    SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 );
    SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 );
    SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 );
    SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 );
    PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
                 res0, res1, res2, res3 );
    PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
                 res4, res5, res6, res7 );
    LD_SB8( p_dst, i_dst_stride,
            dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7 );
    ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
                tmp0, tmp1, tmp2, tmp3 );
    ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
                tmp4, tmp5, tmp6, tmp7 );
    ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
          res0, res1, res2, res3 );
    ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
          res4, res5, res6, res7 );
    CLIP_SH4_0_255( res0, res1, res2, res3 );
    CLIP_SH4_0_255( res4, res5, res6, res7 );
    PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
                 dst0, dst1, dst2, dst3 );
    ST8x4_UB( dst0, dst1, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    ST8x4_UB( dst2, dst3, p_dst, i_dst_stride );
}
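
/* Inverse 4x4 transform of DC coefficients (Hadamard butterflies, no rounding shift). */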
static void avc_idct4x4dc_msa( int16_t *p_src, int32_t i_src_stride,
                               int16_t *p_dst, int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3;
    v4i32 src0_r, src1_r, src2_r, src3_r;
    v4i32 hres0, hres1, hres2, hres3;
    v8i16 vres0, vres1, vres2, vres3;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v2i64 res0, res1;

    LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
    UNPCK_R_SH_SW( src0, src0_r );
    UNPCK_R_SH_SW( src1, src1_r );
    UNPCK_R_SH_SW( src2, src2_r );
    UNPCK_R_SH_SW( src3, src3_r );
    BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, vec0, vec3, vec2, vec1 );
    BUTTERFLY_4( vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1 );
    TRANSPOSE4x4_SW_SW( hres0, hres1, hres2, hres3,
                        hres0, hres1, hres2, hres3 );
    BUTTERFLY_4( hres0, hres2, hres3, hres1, vec0, vec3, vec2, vec1 );
    BUTTERFLY_4( vec0, vec1, vec2, vec3, vec4, vec7, vec6, vec5 );
    PCKEV_H4_SH( vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
                 vres0, vres1, vres2, vres3 );
    PCKOD_D2_SD( vres1, vres0, vres3, vres2, res0, res1 );
    ST8x4_UB( res0, res1, p_dst, i_dst_stride * 2 );
}
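
/* Sum of ( src - pred ) over a 4x4 block, i.e. the unnormalized DC of the residual. */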
static int32_t subtract_sum4x4_msa( uint8_t *p_src, int32_t i_src_stride,
                                    uint8_t *pred_ptr, int32_t i_pred_stride )
{
    int16_t i_sum;
    uint32_t i_src0, i_src1, i_src2, i_src3;
    uint32_t i_pred0, i_pred1, i_pred2, i_pred3;
    v16i8 src = { 0 };
    v16i8 pred = { 0 };
    v16u8 src_l0, src_l1;
    v8i16 diff0, diff1;

    LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 );
    LW4( pred_ptr, i_pred_stride, i_pred0, i_pred1, i_pred2, i_pred3 );
    INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src );
    INSERT_W4_SB( i_pred0, i_pred1, i_pred2, i_pred3, pred );
    ILVRL_B2_UB( src, pred, src_l0, src_l1 );
    HSUB_UB2_SH( src_l0, src_l1, diff0, diff1 );
    i_sum = HADD_UH_U32( diff0 + diff1 );

    return i_sum;
}
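
/* Wrappers matching x264's dct/zigzag function-table signatures; they map the
 * fixed FENC_STRIDE/FDEC_STRIDE buffers onto the helpers above. */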
void x264_dct4x4dc_msa( int16_t d[16] )
{
    avc_dct4x4dc_msa( d, d, 4 );
}

void x264_idct4x4dc_msa( int16_t d[16] )
{
    avc_idct4x4dc_msa( d, 4, d, 4 );
}

void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] )
{
    avc_idct4x4_addblk_msa( p_dst, pi_dct, FDEC_STRIDE );
}

void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] )
{
    avc_idct4x4_addblk_msa( &p_dst[0], &pi_dct[0][0], FDEC_STRIDE );
    avc_idct4x4_addblk_msa( &p_dst[4], &pi_dct[1][0], FDEC_STRIDE );
    avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE + 0], &pi_dct[2][0], FDEC_STRIDE );
    avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE + 4], &pi_dct[3][0], FDEC_STRIDE );
}

void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] )
{
    x264_add8x8_idct_msa( &p_dst[0], &pi_dct[0] );
    x264_add8x8_idct_msa( &p_dst[8], &pi_dct[4] );
    x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE + 0], &pi_dct[8] );
    x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE + 8], &pi_dct[12] );
}

void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] )
{
    avc_idct8_addblk_msa( p_dst, pi_dct, FDEC_STRIDE );
}

void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] )
{
    avc_idct8_addblk_msa( &p_dst[0], &pi_dct[0][0], FDEC_STRIDE );
    avc_idct8_addblk_msa( &p_dst[8], &pi_dct[1][0], FDEC_STRIDE );
    avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE + 0], &pi_dct[2][0], FDEC_STRIDE );
    avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE + 8], &pi_dct[3][0], FDEC_STRIDE );
}

void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] )
{
    avc_idct4x4_addblk_dc_msa( &p_dst[0], &pi_dct[0], FDEC_STRIDE );
    avc_idct4x4_addblk_dc_msa( &p_dst[4], &pi_dct[1], FDEC_STRIDE );
    avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE + 0], &pi_dct[2], FDEC_STRIDE );
    avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE + 4], &pi_dct[3], FDEC_STRIDE );
}

void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] )
{
    for( int32_t i = 0; i < 4; i++, pi_dct += 4, p_dst += 4 * FDEC_STRIDE )
    {
        avc_idct4x4_addblk_dc_msa( &p_dst[0],  &pi_dct[0], FDEC_STRIDE );
        avc_idct4x4_addblk_dc_msa( &p_dst[4],  &pi_dct[1], FDEC_STRIDE );
        avc_idct4x4_addblk_dc_msa( &p_dst[8],  &pi_dct[2], FDEC_STRIDE );
        avc_idct4x4_addblk_dc_msa( &p_dst[12], &pi_dct[3], FDEC_STRIDE );
    }
}

void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src, uint8_t *p_ref )
{
    avc_sub4x4_dct_msa( p_src, FENC_STRIDE, p_ref, FDEC_STRIDE, p_dst );
}

void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src, uint8_t *p_ref )
{
    avc_sub4x4_dct_msa( &p_src[0], FENC_STRIDE,
                        &p_ref[0], FDEC_STRIDE, p_dst[0] );
    avc_sub4x4_dct_msa( &p_src[4], FENC_STRIDE,
                        &p_ref[4], FDEC_STRIDE, p_dst[1] );
    avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE + 0], FENC_STRIDE,
                        &p_ref[4 * FDEC_STRIDE + 0], FDEC_STRIDE, p_dst[2] );
    avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE + 4], FENC_STRIDE,
                        &p_ref[4 * FDEC_STRIDE + 4], FDEC_STRIDE, p_dst[3] );
}

void x264_sub16x16_dct_msa( int16_t p_dst[16][16], uint8_t *p_src, uint8_t *p_ref )
{
    x264_sub8x8_dct_msa( &p_dst[0], &p_src[0], &p_ref[0] );
    x264_sub8x8_dct_msa( &p_dst[4], &p_src[8], &p_ref[8] );
    x264_sub8x8_dct_msa( &p_dst[8], &p_src[8 * FENC_STRIDE + 0],
                         &p_ref[8 * FDEC_STRIDE + 0] );
    x264_sub8x8_dct_msa( &p_dst[12], &p_src[8 * FENC_STRIDE + 8],
                         &p_ref[8 * FDEC_STRIDE + 8] );
}

void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4], uint8_t *p_pix1, uint8_t *p_pix2 )
{
    int32_t d0, d1, d2, d3;

    pi_dct[0] = subtract_sum4x4_msa( &p_pix1[0], FENC_STRIDE,
                                     &p_pix2[0], FDEC_STRIDE );
    pi_dct[1] = subtract_sum4x4_msa( &p_pix1[4], FENC_STRIDE,
                                     &p_pix2[4], FDEC_STRIDE );
    pi_dct[2] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 0], FENC_STRIDE,
                                     &p_pix2[4 * FDEC_STRIDE + 0], FDEC_STRIDE );
    pi_dct[3] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 4], FENC_STRIDE,
                                     &p_pix2[4 * FDEC_STRIDE + 4], FDEC_STRIDE );
    BUTTERFLY_4( pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1], d0, d1, d3, d2 );
    BUTTERFLY_4( d0, d2, d3, d1, pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1] );
}

void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8], uint8_t *p_pix1, uint8_t *p_pix2 )
{
    int32_t a0, a1, a2, a3, a4, a5, a6, a7;
    int32_t b0, b1, b2, b3, b4, b5, b6, b7;

    a0 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 0], FENC_STRIDE,
                              &p_pix2[ 0 * FDEC_STRIDE + 0], FDEC_STRIDE );
    a1 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 4], FENC_STRIDE,
                              &p_pix2[ 0 * FDEC_STRIDE + 4], FDEC_STRIDE );
    a2 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 0], FENC_STRIDE,
                              &p_pix2[ 4 * FDEC_STRIDE + 0], FDEC_STRIDE );
    a3 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 4], FENC_STRIDE,
                              &p_pix2[ 4 * FDEC_STRIDE + 4], FDEC_STRIDE );
    a4 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 0], FENC_STRIDE,
                              &p_pix2[ 8 * FDEC_STRIDE + 0], FDEC_STRIDE );
    a5 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 4], FENC_STRIDE,
                              &p_pix2[ 8 * FDEC_STRIDE + 4], FDEC_STRIDE );
    a6 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 0], FENC_STRIDE,
                              &p_pix2[12 * FDEC_STRIDE + 0], FDEC_STRIDE );
    a7 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 4], FENC_STRIDE,
                              &p_pix2[12 * FDEC_STRIDE + 4], FDEC_STRIDE );
    BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1,
                 b0, b1, b2, b3, b7, b6, b5, b4 );
    BUTTERFLY_8( b0, b2, b4, b6, b7, b5, b3, b1,
                 a0, a1, a2, a3, a7, a6, a5, a4 );
    BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1,
                 pi_dct[0], pi_dct[1], pi_dct[6], pi_dct[7],
                 pi_dct[5], pi_dct[4], pi_dct[3], pi_dct[2] );
}

void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] )
{
    avc_zigzag_scan_4x4_frame_msa( pi_dct, pi_level );
}
#endif
common/mips/dct.h
0 → 100644

/*****************************************************************************
* dct.h: msa transform and zigzag
*****************************************************************************
* Copyright (C) 2015 x264 project
*
* Authors: Rishikesh More <rishikesh.more@imgtec.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or