Commit f85be1cd authored by Loren Merritt, committed by Fiona Glaser
Browse files

asm cosmetics: INIT_MMX/XMM/YMM now support a cpuflags argument

Reduces the number of macro args that need to be passed around.
Allows multiple implementations of a given macro (e.g. PALIGNR) to check
cpuflags at the location where the macro is defined, instead of having
to select implementations by %define at toplevel.
Remove INIT_AVX, as it's replaced by "INIT_XMM avx".

This commit does not change the stripped executable.
parent 67336688
......@@ -62,9 +62,9 @@ ALIGN 16
jl %1
%endmacro
%macro NAL_ESCAPE 1
%macro NAL_ESCAPE 0
cglobal nal_escape_%1, 3,5
cglobal nal_escape, 3,5
mov r3w, [r1]
sub r1, r2 ; r1 = offset of current src pointer from end of src
pxor m4, m4
......@@ -123,9 +123,9 @@ ALIGN 16
jmp .no_escape
%endmacro
INIT_MMX
NAL_ESCAPE mmx2
INIT_XMM
NAL_ESCAPE sse2
INIT_AVX
NAL_ESCAPE avx
INIT_MMX mmx2
NAL_ESCAPE
INIT_XMM sse2
NAL_ESCAPE
INIT_XMM avx
NAL_ESCAPE
......@@ -141,11 +141,9 @@ load_diff_4x8_mmx:
movq m0, [r0]
ret
INIT_MMX
ALIGN 16
dct8_mmx:
cglobal dct8_mmx
DCT8_1D 0,1,2,3,4,5,6,7,r0
SAVE_MM_PERMUTATION dct8_mmx
SAVE_MM_PERMUTATION
ret
%macro SPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets
......@@ -182,7 +180,7 @@ dct8_mmx:
cglobal sub8x8_dct8_mmx, 3,3
global sub8x8_dct8_mmx.skip_prologue
.skip_prologue:
INIT_MMX
RESET_MM_PERMUTATION
call load_diff_4x8_mmx
call dct8_mmx
UNSPILL r0, 0
......@@ -191,7 +189,7 @@ global sub8x8_dct8_mmx.skip_prologue
UNSPILL r0, 4,6
TRANSPOSE4x4W 4,5,6,7,0
SPILL r0, 4,5,6,7
INIT_MMX
RESET_MM_PERMUTATION
add r1, 4
add r2, 4
add r0, 8
......@@ -212,23 +210,21 @@ global sub8x8_dct8_mmx.skip_prologue
movq mm1, m5
movq mm2, mm4
movq mm3, m7
INIT_MMX
RESET_MM_PERMUTATION
UNSPILL r0+8, 4,5,6,7
add r0, 8
call dct8_mmx
sub r0, 8
SPILL r0+8, 1,2,3,5,7
INIT_MMX
RESET_MM_PERMUTATION
UNSPILL r0, 0,1,2,3,4,5,6,7
call dct8_mmx
SPILL r0, 1,2,3,5,7
ret
INIT_MMX
ALIGN 16
idct8_mmx:
cglobal idct8_mmx
IDCT8_1D 0,1,2,3,4,5,6,7,r1
SAVE_MM_PERMUTATION idct8_mmx
SAVE_MM_PERMUTATION
ret
%macro ADD_STORE_ROW 3
......@@ -330,12 +326,12 @@ global add8x8_idct8_mmx.skip_prologue
ADD_STORE_ROW 7, m7, [r1+0x78]
ret
%macro DCT_SUB8 1
cglobal sub8x8_dct_%1, 3,3
%macro DCT_SUB8 0
cglobal sub8x8_dct, 3,3
add r2, 4*FDEC_STRIDE
global sub8x8_dct_%1.skip_prologue
global current_function %+ .skip_prologue
.skip_prologue:
%ifnidn %1, sse2
%if cpuflag(ssse3)
mova m7, [hsub_mul]
%endif
LOAD_DIFF8x4 0, 1, 2, 3, 6, 7, r1, r2-4*FDEC_STRIDE
......@@ -364,11 +360,18 @@ global sub8x8_dct_%1.skip_prologue
;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal sub8x8_dct8_%1, 3,3
cglobal sub8x8_dct8, 3,3
add r2, 4*FDEC_STRIDE
global sub8x8_dct8_%1.skip_prologue
global current_function %+ .skip_prologue
.skip_prologue:
%ifidn %1, sse2
%if cpuflag(ssse3)
mova m7, [hsub_mul]
LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE
SPILL r0, 0,1
SWAP 1, 7
LOAD_DIFF8x4 4, 5, 6, 7, 0, 1, r1, r2-4*FDEC_STRIDE
UNSPILL r0, 0,1
%else
LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2-4*FDEC_STRIDE]
LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2-3*FDEC_STRIDE]
LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2-2*FDEC_STRIDE]
......@@ -379,13 +382,6 @@ global sub8x8_dct8_%1.skip_prologue
LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+2*FDEC_STRIDE]
LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
UNSPILL r0, 0
%else
mova m7, [hsub_mul]
LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE
SPILL r0, 0,1
SWAP 1, 7
LOAD_DIFF8x4 4, 5, 6, 7, 0, 1, r1, r2-4*FDEC_STRIDE
UNSPILL r0, 0,1
%endif
DCT8_1D 0,1,2,3,4,5,6,7,r0
UNSPILL r0, 0,4
......@@ -396,25 +392,24 @@ global sub8x8_dct8_%1.skip_prologue
ret
%endmacro
INIT_XMM
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSE2
INIT_XMM sse2
%define movdqa movaps
%define punpcklqdq movlhps
DCT_SUB8 sse2
DCT_SUB8
%undef movdqa
%undef punpcklqdq
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSSE3
DCT_SUB8 ssse3
INIT_AVX
DCT_SUB8 avx
INIT_XMM ssse3
DCT_SUB8
INIT_XMM avx
DCT_SUB8
;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD8x8 1
cglobal add8x8_idct_%1, 2,2
%macro ADD8x8 0
cglobal add8x8_idct, 2,2
add r0, 4*FDEC_STRIDE
global add8x8_idct_%1.skip_prologue
global current_function %+ .skip_prologue
.skip_prologue:
UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3
SBUTTERFLY qdq, 0, 1, 4
......@@ -447,18 +442,18 @@ global add8x8_idct_%1.skip_prologue
ret
%endmacro ; ADD8x8
INIT_XMM
ADD8x8 sse2
INIT_AVX
ADD8x8 avx
INIT_XMM sse2
ADD8x8
INIT_XMM avx
ADD8x8
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
%macro ADD8x8_IDCT8 1
cglobal add8x8_idct8_%1, 2,2
%macro ADD8x8_IDCT8 0
cglobal add8x8_idct8, 2,2
add r0, 4*FDEC_STRIDE
global add8x8_idct8_%1.skip_prologue
global current_function %+ .skip_prologue
.skip_prologue:
UNSPILL r1, 1,2,3,5,6,7
IDCT8_1D 0,1,2,3,4,5,6,7,r1
......@@ -478,8 +473,8 @@ global add8x8_idct8_%1.skip_prologue
ret
%endmacro ; ADD8x8_IDCT8
INIT_XMM
ADD8x8_IDCT8 sse2
INIT_AVX
ADD8x8_IDCT8 avx
INIT_XMM sse2
ADD8x8_IDCT8
INIT_XMM avx
ADD8x8_IDCT8
%endif ; !HIGH_BIT_DEPTH
......@@ -34,7 +34,6 @@ SECTION .text
%ifndef HIGH_BIT_DEPTH
cextern pw_32
cextern hsub_mul
INIT_XMM
%macro DCT8_1D 10
SUMSUB_BA w, %5, %4 ; %5=s34, %4=d34
......@@ -133,17 +132,17 @@ INIT_XMM
SWAP %3, %8, %7
%endmacro
%macro DCT_SUB8 1
cglobal sub8x8_dct_%1, 3,3,11
%macro DCT_SUB8 0
cglobal sub8x8_dct, 3,3,11
add r2, 4*FDEC_STRIDE
%ifnidn %1, sse2
%if cpuflag(ssse3)
mova m7, [hsub_mul]
%endif
%ifdef WIN64
call .skip_prologue
RET
%endif
global sub8x8_dct_%1.skip_prologue
global current_function %+ .skip_prologue
.skip_prologue:
SWAP 7, 9
LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE
......@@ -161,16 +160,16 @@ global sub8x8_dct_%1.skip_prologue
;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal sub8x8_dct8_%1, 3,3,11
cglobal sub8x8_dct8, 3,3,11
add r2, 4*FDEC_STRIDE
%ifnidn %1, sse2
%if cpuflag(ssse3)
mova m7, [hsub_mul]
%endif
%ifdef WIN64
call .skip_prologue
RET
%endif
global sub8x8_dct8_%1.skip_prologue
global current_function %+ .skip_prologue
.skip_prologue:
SWAP 7, 10
LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
......@@ -189,29 +188,29 @@ global sub8x8_dct8_%1.skip_prologue
ret
%endmacro
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSE2
INIT_XMM sse2
%define movdqa movaps
%define punpcklqdq movlhps
DCT_SUB8 sse2
DCT_SUB8
%undef movdqa
%undef punpcklqdq
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSSE3
DCT_SUB8 ssse3
INIT_AVX
DCT_SUB8 avx
INIT_XMM ssse3
DCT_SUB8
INIT_XMM avx
DCT_SUB8
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
%macro ADD8x8_IDCT8 1
cglobal add8x8_idct8_%1, 2,2,11
%macro ADD8x8_IDCT8 0
cglobal add8x8_idct8, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
%ifdef WIN64
call .skip_prologue
RET
%endif
global add8x8_idct8_%1.skip_prologue
global current_function %+ .skip_prologue
.skip_prologue:
SWAP 7, 9
movdqa m0, [r1+0x00]
......@@ -234,23 +233,23 @@ global add8x8_idct8_%1.skip_prologue
ret
%endmacro ; ADD8x8_IDCT8
INIT_XMM
ADD8x8_IDCT8 sse2
INIT_AVX
ADD8x8_IDCT8 avx
INIT_XMM sse2
ADD8x8_IDCT8
INIT_XMM avx
ADD8x8_IDCT8
;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD8x8 1
cglobal add8x8_idct_%1, 2,2,11
%macro ADD8x8 0
cglobal add8x8_idct, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
%ifdef WIN64
call .skip_prologue
RET
%endif
global add8x8_idct_%1.skip_prologue
global current_function %+ .skip_prologue
.skip_prologue:
SWAP 7, 9
mova m0, [r1+ 0]
......@@ -281,8 +280,8 @@ global add8x8_idct_%1.skip_prologue
ret
%endmacro ; ADD8x8
INIT_XMM
ADD8x8 sse2
INIT_AVX
ADD8x8 avx
INIT_XMM sse2
ADD8x8
INIT_XMM avx
ADD8x8
%endif ; !HIGH_BIT_DEPTH
......@@ -89,8 +89,8 @@ cextern pd_32
;-----------------------------------------------------------------------------
; void dct4x4dc( dctcoef d[4][4] )
;-----------------------------------------------------------------------------
%macro DCT4x4_DC 1
cglobal dct4x4dc_%1, 1,1,5
%macro DCT4x4_DC 0
cglobal dct4x4dc, 1,1,5
mova m0, [r0+ 0]
mova m1, [r0+16]
mova m2, [r0+32]
......@@ -110,14 +110,14 @@ cglobal dct4x4dc_%1, 1,1,5
RET
%endmacro ; DCT4x4_DC
INIT_XMM
DCT4x4_DC sse2
INIT_AVX
DCT4x4_DC avx
INIT_XMM sse2
DCT4x4_DC
INIT_XMM avx
DCT4x4_DC
%else
INIT_MMX
cglobal dct4x4dc_mmx, 1,1
INIT_MMX mmx
cglobal dct4x4dc, 1,1
movq m3, [r0+24]
movq m2, [r0+16]
movq m1, [r0+ 8]
......@@ -141,8 +141,8 @@ cglobal dct4x4dc_mmx, 1,1
;-----------------------------------------------------------------------------
; void idct4x4dc( int32_t d[4][4] )
;-----------------------------------------------------------------------------
%macro IDCT4x4DC 1
cglobal idct4x4dc_%1, 1,1
%macro IDCT4x4DC 0
cglobal idct4x4dc, 1,1
mova m3, [r0+48]
mova m2, [r0+32]
mova m1, [r0+16]
......@@ -157,17 +157,17 @@ cglobal idct4x4dc_%1, 1,1
RET
%endmacro ; IDCT4x4DC
INIT_XMM
IDCT4x4DC sse2
INIT_AVX
IDCT4x4DC avx
INIT_XMM sse2
IDCT4x4DC
INIT_XMM avx
IDCT4x4DC
%else
INIT_MMX
;-----------------------------------------------------------------------------
; void idct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
cglobal idct4x4dc_mmx, 1,1
INIT_MMX mmx
cglobal idct4x4dc, 1,1
movq m3, [r0+24]
movq m2, [r0+16]
movq m1, [r0+ 8]
......@@ -182,12 +182,12 @@ cglobal idct4x4dc_mmx, 1,1
RET
%endif ; HIGH_BIT_DEPTH
INIT_MMX
%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
;-----------------------------------------------------------------------------
cglobal sub4x4_dct_mmx, 3,3
INIT_MMX mmx
cglobal sub4x4_dct, 3,3
.skip_prologue:
LOAD_DIFF m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
......@@ -216,18 +216,13 @@ cglobal sub4x4_dct_mmx, 3,3
RET
%else
%macro SUB_DCT4 1
cglobal sub4x4_dct_%1, 3,3
%ifidn %1, mmx
%macro SUB_DCT4 0
cglobal sub4x4_dct, 3,3
.skip_prologue:
LOAD_DIFF m0, m4, m5, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m3, m4, m5, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
LOAD_DIFF m1, m4, m5, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
LOAD_DIFF m2, m4, m5, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
%else
%if cpuflag(ssse3)
mova m5, [hsub_mul]
LOAD_DIFF8x4_SSSE3 0, 3, 1, 2, 4, 5, r1, r2
%endif
LOAD_DIFF8x4 0, 3, 1, 2, 4, 5, r1, r2
DCT4_1D 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
DCT4_1D 0,1,2,3,4
......@@ -238,8 +233,10 @@ cglobal sub4x4_dct_%1, 3,3
RET
%endmacro
SUB_DCT4 mmx
SUB_DCT4 ssse3
INIT_MMX mmx
SUB_DCT4
INIT_MMX ssse3
SUB_DCT4
%endif ; HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
......@@ -258,8 +255,8 @@ SUB_DCT4 ssse3
movhps %6, %1
%endmacro
%macro ADD4x4_IDCT 1
cglobal add4x4_idct_%1, 2,2,6
%macro ADD4x4_IDCT 0
cglobal add4x4_idct, 2,2,6
add r0, 2*FDEC_STRIDEB
.skip_prologue:
mova m1, [r1+16]
......@@ -276,14 +273,15 @@ cglobal add4x4_idct_%1, 2,2,6
RET
%endmacro
INIT_XMM
ADD4x4_IDCT sse2
INIT_AVX
ADD4x4_IDCT avx
INIT_XMM sse2
ADD4x4_IDCT
INIT_XMM avx
ADD4x4_IDCT
%else ; !HIGH_BIT_DEPTH
cglobal add4x4_idct_mmx, 2,2
INIT_MMX mmx
cglobal add4x4_idct, 2,2
pxor m7, m7
.skip_prologue:
movq m1, [r1+ 8]
......@@ -300,8 +298,8 @@ cglobal add4x4_idct_mmx, 2,2
STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
RET
%macro ADD4x4 1
cglobal add4x4_idct_%1, 2,2,6
%macro ADD4x4 0
cglobal add4x4_idct, 2,2,6
mova m1, [r1+0x00] ; row1/row0
mova m3, [r1+0x10] ; row3/row2
psraw m0, m1, 1 ; row1>>1/...
......@@ -350,10 +348,10 @@ cglobal add4x4_idct_%1, 2,2,6
RET
%endmacro ; ADD4x4
INIT_XMM
ADD4x4 sse4
INIT_AVX
ADD4x4 avx
INIT_XMM sse4
ADD4x4
INIT_XMM avx
ADD4x4
%endif ; HIGH_BIT_DEPTH
INIT_MMX
......@@ -374,24 +372,24 @@ cglobal %1, 3,3,11*(mmsize/16)
%ifdef WIN64
sub rsp, 8
%endif
call %2
call %2.skip_prologue
add r0, %3
add r1, %4-%5-%6*FENC_STRIDE
add r2, %4-%5-%6*FDEC_STRIDE
call %2
call %2.skip_prologue
add r0, %3
add r1, (%4-%6)*FENC_STRIDE-%5-%4
add r2, (%4-%6)*FDEC_STRIDE-%5-%4
call %2
call %2.skip_prologue
add r0, %3
add r1, %4-%5-%6*FENC_STRIDE
add r2, %4-%5-%6*FDEC_STRIDE
%ifdef WIN64
call %2
call %2.skip_prologue
add rsp, 8
RET
%else
jmp %2
jmp %2.skip_prologue
%endif
%endmacro
......@@ -412,75 +410,74 @@ cglobal %1, 2,2,11*(mmsize/16)
%ifdef WIN64
sub rsp, 8
%endif
call %2
call %2.skip_prologue
add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3
call %2
call %2.skip_prologue
add r0, (%4-%6)*FDEC_STRIDE-%5-%4
add r1, %3
call %2
call %2.skip_prologue
add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3
%ifdef WIN64
call %2
call %2.skip_prologue
add rsp, 8
RET
%else
jmp %2
jmp %2.skip_prologue
%endif
%endmacro
%ifdef HIGH_BIT_DEPTH
INIT_MMX
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 64, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 64, 16, 8, 8
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8
INIT_XMM
ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2.skip_prologue,64, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_sse2,add8x8_idct_sse2.skip_prologue,64, 16, 8, 8
INIT_AVX
ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx.skip_prologue, 64, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_avx ,add8x8_idct_avx.skip_prologue, 64, 16, 8, 8
ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2, 64, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8
ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx, 64, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 64, 16, 8, 8
%else ; !HIGH_BIT_DEPTH
%ifndef ARCH_X86_64
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
INIT_MMX
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 32, 4, 0, 0
ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx, 32, 4, 0, 0
SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 32, 8, 4, 4
ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx, 32, 8, 4, 4
cextern sub8x8_dct8_mmx.skip_prologue
cextern add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx, 128, 8, 0, 0
ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx, 128, 8, 0, 0
%endif
INIT_XMM
cextern sub8x8_dct_sse2.skip_prologue
cextern sub8x8_dct_ssse3.skip_prologue
cextern sub8x8_dct_avx.skip_prologue
SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx.skip_prologue, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0
cextern add8x8_idct_sse2.skip_prologue
cextern add8x8_idct_avx.skip_prologue
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2.skip_prologue, 128, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx.skip_prologue, 128, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 128, 8, 0, 0