Commit aa5a3293 authored by Daniel Kang, committed by Fiona Glaser

SSE versions of some high-bit-depth DCT functions

Our first Google Code-In patch!
parent 00524dfa
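
In the high-bit-depth build, pixels widen to 16 bits and transform coefficients to 32 bits, so the new SSE2 routines below operate on dword data and are only registered in x264_dct_init() under HIGH_BIT_DEPTH (first hunk). As a minimal, illustrative sketch of the type split implied by the prototypes this patch adds to dct.h (the typedef names follow the pixel/dctcoef comments in the asm, not a quote of x264's headers):

#include <stdint.h>

/* Illustrative only: the 8-bit build pairs 8-bit pixels with 16-bit
 * coefficients; the high-bit-depth build pairs 16-bit pixels with
 * 32-bit coefficients, matching the new prototypes in dct.h:
 *   void x264_dct4x4dc_sse2   ( int32_t d[16] );
 *   void x264_idct4x4dc_sse2  ( int32_t d[16] );
 *   void x264_add4x4_idct_sse2( uint16_t *p_dst, int32_t dct[16] );
 */
#if HIGH_BIT_DEPTH
typedef uint16_t pixel;
typedef int32_t  dctcoef;
#else
typedef uint8_t  pixel;
typedef int16_t  dctcoef;
#endif
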
@@ -429,6 +429,12 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
}
if( cpu&X264_CPU_SSE2 )
{
dctf->add4x4_idct = x264_add4x4_idct_sse2;
dctf->dct4x4dc = x264_dct4x4dc_sse2;
dctf->idct4x4dc = x264_idct4x4dc_sse2;
}
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
......
@@ -50,6 +50,7 @@ const pw_3fff, times 8 dw 0x3fff
const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1)
const pd_1, times 4 dd 1
const pd_32, times 4 dd 32
const pd_128, times 4 dd 128
const pw_00ff, times 8 dw 0x00ff
const pw_ff00, times 8 dw 0xff00
......
@@ -38,13 +38,13 @@ cextern hsub_mul
; in: m0..m7
; out: 0,4,6 in mem, rest in regs
%macro DCT8_1D 9
SUMSUB_BA m%8, m%1 ; %8 = s07, %1 = d07
SUMSUB_BA m%7, m%2 ; %7 = s16, %2 = d16
SUMSUB_BA m%6, m%3 ; %6 = s25, %3 = d25
SUMSUB_BA m%5, m%4 ; %5 = s34, %4 = d34
SUMSUB_BA m%5, m%8 ; %5 = a0, %8 = a2
SUMSUB_BA m%6, m%7 ; %6 = a1, %7 = a3
SUMSUB_BA m%6, m%5 ; %6 = dst0, %5 = dst4
SUMSUB_BA w, m%8, m%1 ; %8 = s07, %1 = d07
SUMSUB_BA w, m%7, m%2 ; %7 = s16, %2 = d16
SUMSUB_BA w, m%6, m%3 ; %6 = s25, %3 = d25
SUMSUB_BA w, m%5, m%4 ; %5 = s34, %4 = d34
SUMSUB_BA w, m%5, m%8 ; %5 = a0, %8 = a2
SUMSUB_BA w, m%6, m%7 ; %6 = a1, %7 = a3
SUMSUB_BA w, m%6, m%5 ; %6 = dst0, %5 = dst4
mova [%9+0x00], m%6
mova [%9+0x40], m%5
mova m%6, m%7 ; a3
@@ -127,13 +127,13 @@ cextern hsub_mul
psubw m%2, m%1
mova m%1, [%9+0x00]
mova m%6, [%9+0x40]
SUMSUB_BA m%6, m%1
SUMSUB_BA m%7, m%6
SUMSUB_BA m%3, m%1
SUMSUB_BA m%5, m%7
SUMSUB_BA m%2, m%3
SUMSUB_BA m%8, m%1
SUMSUB_BA m%4, m%6
SUMSUB_BA w, m%6, m%1
SUMSUB_BA w, m%7, m%6
SUMSUB_BA w, m%3, m%1
SUMSUB_BA w, m%5, m%7
SUMSUB_BA w, m%2, m%3
SUMSUB_BA w, m%8, m%1
SUMSUB_BA w, m%4, m%6
SWAP %1, %5, %6
SWAP %3, %8, %7
%endmacro
@@ -434,18 +434,18 @@ global add8x8_idct_sse2.skip_prologue
SBUTTERFLY qdq, 4, 5, 0
SBUTTERFLY qdq, 6, 7, 0
UNSPILL r1,0
IDCT4_1D 0,1,2,3,r1
IDCT4_1D w,0,1,2,3,r1
SPILL r1, 4
TRANSPOSE2x4x4W 0,1,2,3,4
UNSPILL r1, 4
IDCT4_1D 4,5,6,7,r1
IDCT4_1D w,4,5,6,7,r1
SPILL r1, 0
TRANSPOSE2x4x4W 4,5,6,7,0
UNSPILL r1, 0
paddw m0, [pw_32]
IDCT4_1D 0,1,2,3,r1
IDCT4_1D w,0,1,2,3,r1
paddw m4, [pw_32]
IDCT4_1D 4,5,6,7,r1
IDCT4_1D w,4,5,6,7,r1
SPILL r1, 6,7
pxor m7, m7
DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
......
@@ -36,13 +36,13 @@ cextern hsub_mul
INIT_XMM
%macro DCT8_1D 10
SUMSUB_BA m%5, m%4 ; %5=s34, %4=d34
SUMSUB_BA m%6, m%3 ; %6=s25, %3=d25
SUMSUB_BA m%7, m%2 ; %7=s16, %2=d16
SUMSUB_BA m%8, m%1 ; %8=s07, %1=d07
SUMSUB_BA w, m%5, m%4 ; %5=s34, %4=d34
SUMSUB_BA w, m%6, m%3 ; %6=s25, %3=d25
SUMSUB_BA w, m%7, m%2 ; %7=s16, %2=d16
SUMSUB_BA w, m%8, m%1 ; %8=s07, %1=d07
SUMSUB_BA m%6, m%7, m%10 ; %6=a1, %7=a3
SUMSUB_BA m%5, m%8, m%10 ; %5=a0, %8=a2
SUMSUB_BA w, m%6, m%7, m%10 ; %6=a1, %7=a3
SUMSUB_BA w, m%5, m%8, m%10 ; %5=a0, %8=a2
movdqa m%9, m%1
psraw m%9, 1
@@ -56,7 +56,7 @@ INIT_XMM
paddw m%10, m%2
psubw m%10, m%3 ; %10=a7
SUMSUB_BA m%4, m%1
SUMSUB_BA w, m%4, m%1
psubw m%1, m%3
psubw m%4, m%2
psraw m%3, 1
@@ -70,7 +70,7 @@ INIT_XMM
psraw m%9, 2
psubw m%9, m%10 ; %9=b7
SUMSUB_BA m%6, m%5, m%10 ; %6=b0, %5=b4
SUMSUB_BA w, m%6, m%5, m%10 ; %6=b0, %5=b4
movdqa m%3, m%7
psraw m%3, 1
@@ -88,7 +88,7 @@ INIT_XMM
%endmacro
%macro IDCT8_1D 10
SUMSUB_BA m%5, m%1, m%9 ; %5=a0, %1=a2
SUMSUB_BA w, m%5, m%1, m%9 ; %5=a0, %1=a2
movdqa m%9, m%2
psraw m%9, 1
@@ -123,8 +123,8 @@ INIT_XMM
psraw m%6, 2
psubw m%9, m%6 ; %9=b7
SUMSUB_BA m%7, m%5, m%6 ; %7=b0, %5=b6
SUMSUB_BA m%3, m%1, m%6; %3=b2, %1=b4
SUMSUB_BA w, m%7, m%5, m%6 ; %7=b0, %5=b6
SUMSUB_BA w, m%3, m%1, m%6 ; %3=b2, %1=b4
movdqa m%8, m%10
psraw m%8, 2
@@ -132,10 +132,10 @@ INIT_XMM
psraw m%2, 2
psubw m%2, m%10 ; %2=b5
SUMSUB_BA m%9, m%7, m%6 ; %9=c0, %7=c7
SUMSUB_BA m%2, m%3, m%6 ; %2=c1, %3=c6
SUMSUB_BA m%8, m%1, m%6 ; %8=c2, %1=c5
SUMSUB_BA m%4, m%5, m%6 ; %4=c3, %5=c4
SUMSUB_BA w, m%9, m%7, m%6 ; %9=c0, %7=c7
SUMSUB_BA w, m%2, m%3, m%6 ; %2=c1, %3=c6
SUMSUB_BA w, m%8, m%1, m%6 ; %8=c2, %1=c5
SUMSUB_BA w, m%4, m%5, m%6 ; %4=c3, %5=c4
SWAP %1, %9, %6
SWAP %3, %8, %7
@@ -263,14 +263,14 @@ global add8x8_idct_sse2.skip_prologue
mova m7, [r1+112]
SBUTTERFLY qdq, 4, 5, 8
SBUTTERFLY qdq, 6, 7, 8
IDCT4_1D 0,1,2,3,8,10
IDCT4_1D w,0,1,2,3,8,10
TRANSPOSE2x4x4W 0,1,2,3,8
IDCT4_1D 4,5,6,7,8,10
IDCT4_1D w,4,5,6,7,8,10
TRANSPOSE2x4x4W 4,5,6,7,8
paddw m0, [pw_32]
IDCT4_1D 0,1,2,3,8,10
IDCT4_1D w,0,1,2,3,8,10
paddw m4, [pw_32]
IDCT4_1D 4,5,6,7,8,10
IDCT4_1D w,4,5,6,7,8,10
DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
......
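
The dct-a.asm hunks that follow add the HIGH_BIT_DEPTH SSE2 versions of dct4x4dc, idct4x4dc and add4x4_idct. A scalar reading of what dct4x4dc_sse2 computes, written out for illustration only, is a 4x4 Walsh-Hadamard transform whose outputs are rounded with (x+1)>>1; the +1 is folded in by the single paddd with pd_1 before the second 1-D pass, since every output of that pass contains the first input with a +1 coefficient. The exact output ordering below is not guaranteed to match the asm; it is only meant to show the arithmetic:

#include <stdint.h>

static void dct4x4dc_ref( int32_t d[16] )
{
    int32_t tmp[16];
    /* first 1-D pass, down the columns */
    for( int i = 0; i < 4; i++ )
    {
        int32_t s01 = d[0*4+i] + d[1*4+i];
        int32_t d01 = d[0*4+i] - d[1*4+i];
        int32_t s23 = d[2*4+i] + d[3*4+i];
        int32_t d23 = d[2*4+i] - d[3*4+i];
        tmp[0*4+i] = s01 + s23;
        tmp[1*4+i] = s01 - s23;
        tmp[2*4+i] = d01 + d23;
        tmp[3*4+i] = d01 - d23;
    }
    /* second 1-D pass, along the rows, with (x+1)>>1 rounding */
    for( int i = 0; i < 4; i++ )
    {
        int32_t s01 = tmp[i*4+0] + tmp[i*4+1];
        int32_t d01 = tmp[i*4+0] - tmp[i*4+1];
        int32_t s23 = tmp[i*4+2] + tmp[i*4+3];
        int32_t d23 = tmp[i*4+2] - tmp[i*4+3];
        d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
        d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
        d[i*4+2] = ( d01 + d23 + 1 ) >> 1;
        d[i*4+3] = ( d01 - d23 + 1 ) >> 1;
    }
}

idct4x4dc_sse2 is the same two passes without the rounding, and add4x4_idct_sse2 rounds with pd_32 plus a 6-bit shift in STORE_DIFFx2 before clipping the reconstruction to pw_pixel_max.
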
@@ -52,14 +52,17 @@ SECTION .text
cextern pw_32_0
cextern pw_32
cextern pw_8000
cextern pw_pixel_max
cextern hsub_mul
cextern pb_1
cextern pw_1
cextern pd_1
cextern pd_32
%macro WALSH4_1D 5
SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
SWAP %1, %4, %3
%macro WALSH4_1D 6
SUMSUB_BADC %1, m%5, m%4, m%3, m%2, m%6
SUMSUB_BADC %1, m%5, m%3, m%4, m%2, m%6
SWAP %2, %5, %4
%endmacro
%macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
@@ -74,19 +77,41 @@ cextern pw_1
SWAP %1, %2, %3
%endmacro
INIT_MMX
%ifdef HIGH_BIT_DEPTH
INIT_XMM
;-----------------------------------------------------------------------------
; void dct4x4dc( int16_t d[4][4] )
; void dct4x4dc( dctcoef d[4][4] )
;-----------------------------------------------------------------------------
cglobal dct4x4dc_sse2, 1,1,5
mova m0, [r0+ 0]
mova m1, [r0+16]
mova m2, [r0+32]
mova m3, [r0+48]
WALSH4_1D d, 0,1,2,3,4
TRANSPOSE4x4D 0,1,2,3,4
paddd m0, [pd_1]
WALSH4_1D d, 0,1,2,3,4
psrad m0, 1
psrad m1, 1
psrad m2, 1
psrad m3, 1
mova [r0+ 0], m0
mova [r0+16], m1
mova [r0+32], m2
mova [r0+48], m3
RET
%else
INIT_MMX
cglobal dct4x4dc_mmx, 1,1
movq m3, [r0+24]
movq m2, [r0+16]
movq m1, [r0+ 8]
movq m0, [r0+ 0]
movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
WALSH4_1D 0,1,2,3,4
WALSH4_1D w, 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
SUMSUB_BADC m1, m0, m3, m2, m4
SUMSUB_BADC w, m1, m0, m3, m2, m4
SWAP 0, 1
SWAP 2, 3
SUMSUB_17BIT 0,2,4,7
@@ -96,7 +121,29 @@ cglobal dct4x4dc_mmx, 1,1
movq [r0+16], m3
movq [r0+24], m1
RET
%endif ; HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void idct4x4dc( int32_t d[4][4] )
;-----------------------------------------------------------------------------
INIT_XMM
cglobal idct4x4dc_sse2, 1,1
mova m3, [r0+48]
mova m2, [r0+32]
mova m1, [r0+16]
mova m0, [r0+ 0]
WALSH4_1D d,0,1,2,3,4
TRANSPOSE4x4D 0,1,2,3,4
WALSH4_1D d,0,1,2,3,4
mova [r0+ 0], m0
mova [r0+16], m1
mova [r0+32], m2
mova [r0+48], m3
RET
%else
INIT_MMX
;-----------------------------------------------------------------------------
; void idct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
@@ -105,18 +152,20 @@ cglobal idct4x4dc_mmx, 1,1
movq m2, [r0+16]
movq m1, [r0+ 8]
movq m0, [r0+ 0]
WALSH4_1D 0,1,2,3,4
WALSH4_1D w,0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
WALSH4_1D 0,1,2,3,4
WALSH4_1D w,0,1,2,3,4
movq [r0+ 0], m0
movq [r0+ 8], m1
movq [r0+16], m2
movq [r0+24], m3
RET
%endif ; HIGH_BIT_DEPTH
INIT_MMX
%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void sub4x4_dct( int32_t dct[4][4], uint16_t *pix1, uint16_t *pix2 )
; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
;-----------------------------------------------------------------------------
cglobal sub4x4_dct_mmx, 3,3
.skip_prologue:
@@ -132,13 +181,9 @@ cglobal sub4x4_dct_mmx, 3,3
STORE_DIFF m2, m4, m5, [r0+32], [r0+40]
STORE_DIFF m3, m4, m5, [r0+48], [r0+56]
RET
%endif ; HIGH_BIT_DEPTH
%else
%ifndef HIGH_BIT_DEPTH
%macro SUB_DCT4 1
;-----------------------------------------------------------------------------
; void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal sub4x4_dct_%1, 3,3
%ifidn %1, mmx
.skip_prologue:
@@ -162,12 +207,42 @@ cglobal sub4x4_dct_%1, 3,3
SUB_DCT4 mmx
SUB_DCT4 ssse3
%endif ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
; void add4x4_idct( pixel *p_dst, dctcoef dct[4][4] )
;-----------------------------------------------------------------------------
%macro STORE_DIFFx2 6
psrad %1, 6
psrad %2, 6
packssdw %1, %2
movq %3, %5
movhps %3, %6
paddsw %1, %3
pxor %4, %4
CLIPW %1, %4, [pw_pixel_max]
movq %5, %1
movhps %6, %1
%endmacro
INIT_XMM
cglobal add4x4_idct_sse2, 2,2,7
pxor m6, m6
.skip_prologue:
mova m1, [r1+16]
mova m3, [r1+48]
mova m2, [r1+32]
mova m0, [r1+ 0]
IDCT4_1D d,0,1,2,3,4,5
TRANSPOSE4x4D 0,1,2,3,4
paddd m0, [pd_32]
IDCT4_1D d,0,1,2,3,4,5
STORE_DIFFx2 m0, m1, m4, m6, [r0+0*FDEC_STRIDE], [r0+2*FDEC_STRIDE]
STORE_DIFFx2 m2, m3, m4, m6, [r0+4*FDEC_STRIDE], [r0+6*FDEC_STRIDE]
RET
%else
cglobal add4x4_idct_mmx, 2,2
pxor m7, m7
.skip_prologue:
@@ -175,10 +250,10 @@ cglobal add4x4_idct_mmx, 2,2
movq m3, [r1+24]
movq m2, [r1+16]
movq m0, [r1+ 0]
IDCT4_1D 0,1,2,3,4,5
IDCT4_1D w,0,1,2,3,4,5
TRANSPOSE4x4W 0,1,2,3,4
paddw m0, [pw_32]
IDCT4_1D 0,1,2,3,4,5
IDCT4_1D w,0,1,2,3,4,5
STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
@@ -198,7 +273,7 @@ cglobal add4x4_idct_sse4, 2,2,6
psubw m0, m3 ; row1>>1-row3/row0-2
paddw m2, m1 ; row3>>1+row1/row0+2
SBUTTERFLY2 wd, 0, 2, 1
SUMSUB_BA m2, m0, m1
SUMSUB_BA w, m2, m0, m1
pshuflw m1, m2, 10110001b
pshufhw m2, m2, 10110001b
punpckldq m1, m0
@@ -215,7 +290,7 @@ cglobal add4x4_idct_sse4, 2,2,6
psubw m0, m3 ; row1>>1-row3/row0-2
paddw m2, m1 ; row3>>1+row1/row0+2
SBUTTERFLY2 qdq, 0, 2, 1
SUMSUB_BA m2, m0, m1
SUMSUB_BA w, m2, m0, m1
movd m4, [r0+FDEC_STRIDE*0]
movd m1, [r0+FDEC_STRIDE*1]
@@ -236,7 +311,7 @@ cglobal add4x4_idct_sse4, 2,2,6
movd [r0+FDEC_STRIDE*2], m0
pextrd [r0+FDEC_STRIDE*3], m0, 1
RET
%endif ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
INIT_MMX
;-----------------------------------------------------------------------------
......
@@ -40,6 +40,7 @@ void x264_sub8x8_dct_dc_mmxext( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix
void x264_sub8x8_dct_dc_sse2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] );
void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] );
void x264_add4x4_idct_sse4 ( uint8_t *p_dst, int16_t dct [16] );
void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][16] );
void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct [ 4] );
@@ -52,7 +53,9 @@ void x264_add8x8_idct_dc_ssse3 ( uint8_t *p_dst, int16_t dct [ 4] );
void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct [16] );
void x264_dct4x4dc_mmx ( int16_t d[16] );
void x264_dct4x4dc_sse2 ( int32_t d[16] );
void x264_idct4x4dc_mmx ( int16_t d[16] );
void x264_idct4x4dc_sse2 ( int32_t d[16] );
void x264_sub8x8_dct8_mmx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_mmx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
......
@@ -39,7 +39,6 @@ hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
pd_16: times 4 dd 16
pd_32: times 4 dd 32
pd_0f: times 4 dd 0xffff
pad10: times 8 dw 10*PIXEL_MAX
......
@@ -881,7 +881,7 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8
DEINTB %1, %2, %3, %4, %5
psubw m%1, m%3
psubw m%2, m%4
SUMSUB_BA m%1, m%2, m%3
SUMSUB_BA w, m%1, m%2, m%3
%endmacro
%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
@@ -1278,10 +1278,10 @@ cglobal pixel_sa8d_8x8_internal_%1
%else ; non-sse2
HADAMARD4_V m0, m1, m2, m8, m6
HADAMARD4_V m4, m5, m3, m9, m6
SUMSUB_BADC m0, m4, m1, m5, m6
SUMSUB_BADC w, m0, m4, m1, m5, m6
HADAMARD 2, sumsub, 0, 4, 6, 11
HADAMARD 2, sumsub, 1, 5, 6, 11
SUMSUB_BADC m2, m3, m8, m9, m6
SUMSUB_BADC w, m2, m3, m8, m9, m6
HADAMARD 2, sumsub, 2, 3, 6, 11
HADAMARD 2, sumsub, 8, 9, 6, 11
HADAMARD 1, amax, 0, 4, 6, 11
@@ -1379,7 +1379,7 @@ cglobal pixel_sa8d_8x8_internal_%1
mova spill0, m6
mova spill1, m7
HADAMARD4_V m0, m1, m2, m3, m7
SUMSUB_BADC m0, m4, m1, m5, m7
SUMSUB_BADC w, m0, m4, m1, m5, m7
HADAMARD 2, sumsub, 0, 4, 7, 6
HADAMARD 2, sumsub, 1, 5, 7, 6
HADAMARD 1, amax, 0, 4, 7, 6
@@ -1387,7 +1387,7 @@ cglobal pixel_sa8d_8x8_internal_%1
mova m6, spill0
mova m7, spill1
paddw m0, m1
SUMSUB_BADC m2, m6, m3, m7, m4
SUMSUB_BADC w, m2, m6, m3, m7, m4
HADAMARD 2, sumsub, 2, 6, 4, 5
HADAMARD 2, sumsub, 3, 7, 4, 5
HADAMARD 1, amax, 2, 6, 4, 5
@@ -1994,7 +1994,7 @@ cglobal hadamard_ac_2x2max_mmxext
mova m2, [r3+0x40]
mova m3, [r3+0x60]
sub r3, 8
SUMSUB_BADC m0, m1, m2, m3, m4
SUMSUB_BADC w, m0, m1, m2, m3, m4
ABS4 m0, m2, m1, m3, m4, m5
HADAMARD 0, max, 0, 2, 4, 5
HADAMARD 0, max, 1, 3, 4, 5
@@ -2059,7 +2059,7 @@ cglobal hadamard_ac_8x8_mmxext
mova m1, [r3+0x20]
mova m2, [r3+0x40]
mova m3, [r3+0x60]
SUMSUB_BADC m0, m1, m2, m3, m4
SUMSUB_BADC w, m0, m1, m2, m3, m4
HADAMARD 0, sumsub, 0, 2, 4, 5
ABS4 m1, m3, m0, m2, m4, m5
HADAMARD 0, max, 1, 3, 4, 5
@@ -2243,9 +2243,6 @@ cglobal hadamard_ac_8x8_%1
HADAMARD4_2D_SSE 4, 5, 6, 7, 1
%else
HADAMARD4_V m4, m5, m6, m7, m1
%endif
%if vertical == 0
mova m1, spill0
mova spill0, m6
mova spill1, m7
@@ -2259,14 +2256,13 @@ cglobal hadamard_ac_8x8_%1
HADAMARD 1, sumsub, 6, 7, 1, 0
mova m0, spill1
%endif
mova spill1, m2
mova spill2, m3
ABS_MOV m1, m0
ABS_MOV m2, m4
ABS_MOV m3, m5
paddw m1, m2
SUMSUB_BA m0, m4; m2
SUMSUB_BA w, m0, m4
%if vertical
pand m1, [mask_ac4]
%else
......
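
The x86util.asm hunks below thread an explicit element-size prefix (w for 16-bit words, d for 32-bit dwords) through the SUMSUB/HADAMARD/DCT helper macros, which is why every call site earlier in this commit gains a leading w or d argument. The butterfly these macros implement is just a per-lane sum and difference; a rough C equivalent of the two-operand SUMSUB_BA form, my reading of the macro rather than code from the patch:

#include <stdint.h>

/* SUMSUB_BA %1, a, b  (no temp register) expands to:
 *   padd%1 a, b   ; a = a + b
 *   padd%1 b, b   ; b = 2*b
 *   psub%1 b, a   ; b = 2*b - (a+b) = b - a_orig
 * i.e. a sum/difference butterfly per SIMD lane; %1 only selects the
 * element width, so one macro body serves 16-bit and 32-bit lanes. */
static inline void sumsub_ba_w( int16_t *a, int16_t *b )
{
    int16_t s    = (int16_t)(*a + *b);   /* truncates like paddw (mod 2^16) */
    int16_t diff = (int16_t)(*b - *a);   /* truncates like psubw */
    *a = s;
    *b = diff;
}

static inline void sumsub_ba_d( int32_t *a, int32_t *b )
{
    int32_t s    = *a + *b;              /* paddd on dword lanes */
    int32_t diff = *b - *a;              /* psubd on dword lanes */
    *a = s;
    *b = diff;
}

Passing the width as a macro argument keeps a single copy of each transform macro instead of duplicating word and dword variants.
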
@@ -241,44 +241,44 @@
psrlw m%4, 8 ; src .. y7 .. y5
%endmacro
%macro SUMSUB_BA 2-3
%if %0==2
paddw %1, %2
paddw %2, %2
psubw %2, %1
%macro SUMSUB_BA 3-4
%if %0==3
padd%1 %2, %3
padd%1 %3, %3
psub%1 %3, %2
%else
mova %3, %1
paddw %1, %2
psubw %2, %3
mova %4, %2
padd%1 %2, %3
psub%1 %3, %4
%endif
%endmacro
%macro SUMSUB_BADC 4-5
%if %0==5
SUMSUB_BA %1, %2, %5
SUMSUB_BA %3, %4, %5
%macro SUMSUB_BADC 5-6
%if %0==6
SUMSUB_BA %1, %2, %3, %6
SUMSUB_BA %1, %4, %5, %6
%else
paddw %1, %2
paddw %3, %4
paddw %2, %2
paddw %4, %4
psubw %2, %1
psubw %4, %3
padd%1 %2, %3
padd%1 %4, %5
padd%1 %3, %3
padd%1 %5, %5
psub%1 %3, %2
psub%1 %5, %4
%endif
%endmacro
%macro HADAMARD4_V 4+
SUMSUB_BADC %1, %2, %3, %4
SUMSUB_BADC %1, %3, %2, %4
SUMSUB_BADC w, %1, %2, %3, %4
SUMSUB_BADC w, %1, %3, %2, %4
%endmacro
%macro HADAMARD8_V 8+
SUMSUB_BADC %1, %2, %3, %4
SUMSUB_BADC %5, %6, %7, %8
SUMSUB_BADC %1, %3, %2, %4
SUMSUB_BADC %5, %7, %6, %8
SUMSUB_BADC %1, %5, %2, %6
SUMSUB_BADC %3, %7, %4, %8
SUMSUB_BADC w, %1, %2, %3, %4
SUMSUB_BADC w, %5, %6, %7, %8
SUMSUB_BADC w, %1, %3, %2, %4
SUMSUB_BADC w, %5, %7, %6, %8
SUMSUB_BADC w, %1, %5, %2, %6
SUMSUB_BADC w, %3, %7, %4, %8
%endmacro
%macro TRANS_SSE2 5-6
@@ -363,7 +363,7 @@
%endif
%endif
%ifidn %2, sumsub
SUMSUB_BA m%3, m%4, m%5
SUMSUB_BA w, m%3, m%4, m%5
%else
%ifidn %2, amax
%if %0==6
@@ -426,67 +426,71 @@
%endif
%endmacro
%macro SUMSUB2_AB 3
mova %3, %1
paddw %1, %1
paddw %1, %2
psubw %3, %2
psubw %3, %2
%macro SUMSUB2_AB 4
mova %4, %2
padd%1 %2, %2
padd%1 %2, %3
psub%1 %4, %3
psub%1 %4, %3
%endmacro
%macro SUMSUB2_BA 3
mova m%3, m%1
paddw m%1, m%2
paddw m%1, m%2
psubw m%2, m%3
psubw m%2, m%3
%macro SUMSUB2_BA 4
mova m%4, m%2
padd%1 m%2, m%3
padd%1 m%2, m%3
psub%1 m%3, m%4
psub%1 m%3, m%4
%endmacro
%macro SUMSUBD2_AB 4
mova %4, %1
mova %3, %2
psraw %2, 1 ; %2: %2>>1
psraw %1, 1 ; %1: %1>>1
paddw %2, %4 ; %2: %2>>1+%1
psubw %1, %3 ; %1: %1>>1-%2
%macro SUMSUBD2_AB 5
mova %5, %2
mova %4, %3
psra%1 %3, 1 ; %3: %3>>1
psra%1 %2, 1 ; %2: %2>>1
padd%1 %3, %5 ; %3: %3>>1+%2
psub%1 %2, %4 ; %2: %2>>1-%3
%endmacro
%macro DCT4_1D 5
%ifnum %5
SUMSUB_BADC m%4, m%1, m%3, m%2; m%5
SUMSUB_BA m%3, m%4, m%5
SUMSUB2_AB m%1, m%2, m%5
SUMSUB_BADC w, m%4, m%1, m%3, m%2, m%5
SUMSUB_BA w, m%3, m%4, m%5
SUMSUB2_AB w, m%1, m%2, m%5
SWAP %1, %3, %4, %5, %2
%else
SUMSUB_BADC m%4, m%1, m%3, m%2
SUMSUB_BA m%3, m%4
mova [%5], m%2
SUMSUB2_AB m%1, [%5], m%2
SUMSUB_BADC w, m%4, m%1, m%3, m%2
SUMSUB_BA w, m%3, m%4
mova [%5], m%2
SUMSUB2_AB w, m%1, [%5], m%2
SWAP %1, %3, %4, %2
%endif
%endmacro
%macro IDCT4_1D 5-6
%ifnum %5
SUMSUBD2_AB m%2, m%4, m%6, m%5
; %2: %2>>1-%4 %4: %2+%4>>1
SUMSUB_BA m%3, m%1, m%6
; %3: %1+%3 %1: %1-%3
SUMSUB_BADC m%4, m%3, m%2, m%1, m%6
; %4: %1+%3 + (%2+%4>>1)
; %3: %1+%3 - (%2+%4>>1)
; %2: %1-%3 + (%2>>1-%4)
; %1: %1-%3 - (%2>>1-%4)
%macro IDCT4_1D 6-7
%ifnum %6
SUMSUBD2_AB %1, m%3, m%5, m%7, m%6
; %3: %3>>1-%5 %5: %3+%5>>1
SUMSUB_BA %1, m%4, m%2, m%7
; %4: %2+%4 %2: %2-%4
SUMSUB_BADC %1, m%5, m%4, m%3, m%2, m%7
; %5: %2+%4 + (%3+%5>>1)
; %4: %2+%4 - (%3+%5>>1)
; %3: %2-%4 + (%3>>1-%5)
; %2: %2-%4 - (%3>>1-%5)
%else
%ifidn %1, w
SUMSUBD2_AB %1, m%3, m%5, [%6], [%6+16]
%else
SUMSUBD2_AB m%2, m%4, [%5], [%5+16]
SUMSUB_BA m%3, m%1
SUMSUB_BADC m%4, m%3, m%2, m%1
SUMSUBD2_AB %1, m%3, m%5, [%6], [%6+32]