Commit 9168abfa authored by Loren Merritt

faster x86_32 dct8

parent 56bf7565
@@ -415,14 +415,11 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
if( cpu&X264_CPU_SSE2 )
{
#ifdef ARCH_X86_64
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
#endif
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
}
#endif //HAVE_MMX
#ifdef ARCH_PPC
@@ -37,15 +37,6 @@ SECTION .text
psubw %2, %1
%endmacro
%macro SUMSUB_BADC 4
paddw %1, %2
paddw %3, %4
paddw %2, %2
paddw %4, %4
psubw %2, %1
psubw %4, %3
%endmacro
%macro SBUTTERFLY 4
mova m%4, m%2
punpckl%1 m%2, m%3
@@ -61,330 +52,91 @@ SECTION .text
SWAP %2, %3
%endmacro
; input: 2x8 unsigned bytes (%5,%6), zero (%7); output: difference (%1,%2)
%macro LOAD_DIFF_8P 7
movq %1, %5
movq %2, %1
punpcklbw %1, %7
punpckhbw %2, %7
movq %3, %6
movq %4, %3
punpcklbw %3, %7
punpckhbw %4, %7
psubw %1, %3
psubw %2, %4
%endmacro
%macro LOADSUMSUB 4 ; returns %1=%3+%4, %2=%3-%4
movq %2, %3
movq %1, %4
SUMSUB_BA %1, %2
%macro LOAD_DIFF_8P 4
movh %1, %3
movh %2, %4
punpcklbw %1, %2
punpcklbw %2, %2
psubw %1, %2
%endmacro
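The 4-argument LOAD_DIFF_8P drops the zero register the old 7-argument version needed: unpacking pix1 against pix2 and pix2 against itself builds words whose high bytes cancel in the subtraction. A scalar sketch of that identity (load_diff_8p_model is an illustrative name, not an x264 symbol):
#include <stdint.h>

/* Model of the new LOAD_DIFF_8P:
 * punpcklbw %1,%2 builds the word pix1[i] + 256*pix2[i];
 * punpcklbw %2,%2 builds the word pix2[i] + 256*pix2[i];
 * psubw leaves exactly pix1[i] - pix2[i], so no zero register is needed. */
static void load_diff_8p_model( int16_t diff[8], const uint8_t *pix1, const uint8_t *pix2 )
{
    for( int i = 0; i < 8; i++ )
    {
        uint16_t a = (uint16_t)( pix1[i] | pix2[i] << 8 );  /* punpcklbw %1, %2 */
        uint16_t b = (uint16_t)( pix2[i] | pix2[i] << 8 );  /* punpcklbw %2, %2 */
        diff[i] = (int16_t)( a - b );                       /* psubw %1, %2 */
    }
}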
%macro STORE_DIFF_8P 4
psraw %1, 6
movq %3, %2
movh %3, %2
punpcklbw %3, %4
paddsw %1, %3
packuswb %1, %1
movq %2, %1
movh %2, %1
%endmacro
; in: m0..m7
; out: 0,4,6 in mem, rest in regs
%macro DCT8_1D 9
SUMSUB_BA m%8, m%1 ; %8 = s07, %1 = d07
SUMSUB_BA m%7, m%2 ; %7 = s16, %2 = d16
SUMSUB_BA m%6, m%3 ; %6 = s25, %3 = d25
SUMSUB_BA m%5, m%4 ; %5 = s34, %4 = d34
SUMSUB_BA m%5, m%8 ; %5 = a0, %8 = a2
SUMSUB_BA m%6, m%7 ; %6 = a1, %7 = a3
SUMSUB_BA m%6, m%5 ; %6 = dst0, %5 = dst4
mova [%9+0x00], m%6
mova [%9+0x40], m%5
mova m%6, m%7 ; a3
psraw m%6, 1 ; a3>>1
paddw m%6, m%8 ; a2 + (a3>>1)
psraw m%8, 1 ; a2>>1
psubw m%8, m%7 ; (a2>>1) - a3
mova [%9+0x60], m%8
mova m%5, m%3
psraw m%5, 1
paddw m%5, m%3 ; d25+(d25>>1)
mova m%7, m%1
psubw m%7, m%4 ; a5 = d07-d34-(d25+(d25>>1))
psubw m%7, m%5
mova m%5, m%2
psraw m%5, 1
paddw m%5, m%2 ; d16+(d16>>1)
mova m%8, m%1
paddw m%8, m%4
psubw m%8, m%5 ; a6 = d07+d34-(d16+(d16>>1))
mova m%5, m%1
psraw m%5, 1
paddw m%5, m%1 ; d07+(d07>>1)
paddw m%5, m%2
paddw m%5, m%3 ; a4 = d16+d25+(d07+(d07>>1))
mova m%1, m%4
psraw m%1, 1
paddw m%1, m%4 ; d34+(d34>>1)
paddw m%1, m%2
psubw m%1, m%3 ; a7 = d16-d25+(d34+(d34>>1))
mova m%4, m%1
psraw m%4, 2
paddw m%4, m%5 ; a4 + (a7>>2)
mova m%3, m%8
psraw m%3, 2
paddw m%3, m%7 ; a5 + (a6>>2)
psraw m%5, 2
psraw m%7, 2
psubw m%5, m%1 ; (a4>>2) - a7
psubw m%8, m%7 ; a6 - (a5>>2)
SWAP %2, %4, %3, %6, %8, %5
%endmacro
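For reference, DCT8_1D (and the column pass in x264_ydct8_mmx below) is the 8x8 forward-transform butterfly. The following scalar sketch is assembled from the s/d/a comments above; dct8_1d_sketch is an illustrative name, not an x264 symbol, and it processes one row or column per call.
#include <stdint.h>

/* One 8-point pass of the forward transform, following the s/d/a naming
 * used in the comments of DCT8_1D and x264_ydct8_mmx. */
static void dct8_1d_sketch( int16_t dst[8], const int16_t src[8] )
{
    const int s07 = src[0] + src[7], d07 = src[0] - src[7];
    const int s16 = src[1] + src[6], d16 = src[1] - src[6];
    const int s25 = src[2] + src[5], d25 = src[2] - src[5];
    const int s34 = src[3] + src[4], d34 = src[3] - src[4];
    const int a0 = s07 + s34, a2 = s07 - s34;
    const int a1 = s16 + s25, a3 = s16 - s25;
    const int a4 = d16 + d25 + (d07 + (d07>>1));
    const int a5 = d07 - d34 - (d25 + (d25>>1));
    const int a6 = d07 + d34 - (d16 + (d16>>1));
    const int a7 = d16 - d25 + (d34 + (d34>>1));
    dst[0] = a0 + a1;        dst[4] = a0 - a1;
    dst[2] = a2 + (a3>>1);   dst[6] = (a2>>1) - a3;
    dst[1] = a4 + (a7>>2);   dst[3] = a5 + (a6>>2);
    dst[5] = a6 - (a5>>2);   dst[7] = (a4>>2) - a7;
}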
;-----------------------------------------------------------------------------
; void x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
;-----------------------------------------------------------------------------
ALIGN 16
x264_pixel_sub_8x8_mmx:
pxor mm7, mm7
%assign i 0
%rep 8
LOAD_DIFF_8P mm0, mm1, mm2, mm3, [r1], [r2], mm7
movq [r0+i], mm0
movq [r0+i+8], mm1
add r1, FENC_STRIDE
add r2, FDEC_STRIDE
%assign i i+16
%endrep
ret
;-----------------------------------------------------------------------------
; void x264_ydct8_mmx( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
ALIGN 16
x264_ydct8_mmx:
;-------------------------------------------------------------------------
; vertical dct ( compute 4 columns at a time -> 2 loops )
;-------------------------------------------------------------------------
%assign i 0
%rep 2
LOADSUMSUB mm2, mm3, [r0+i+0*16], [r0+i+7*16] ; mm2 = s07, mm3 = d07
LOADSUMSUB mm1, mm5, [r0+i+1*16], [r0+i+6*16] ; mm1 = s16, mm5 = d16
LOADSUMSUB mm0, mm6, [r0+i+2*16], [r0+i+5*16] ; mm0 = s25, mm6 = d25
LOADSUMSUB mm4, mm7, [r0+i+3*16], [r0+i+4*16] ; mm4 = s34, mm7 = d34
SUMSUB_BA mm4, mm2 ; mm4 = a0, mm2 = a2
SUMSUB_BA mm0, mm1 ; mm0 = a1, mm1 = a3
SUMSUB_BA mm0, mm4 ; mm0 = dst0, mm4 = dst4
movq [r0+i+0*16], mm0
movq [r0+i+4*16], mm4
movq mm0, mm1 ; a3
psraw mm0, 1 ; a3>>1
paddw mm0, mm2 ; a2 + (a3>>1)
psraw mm2, 1 ; a2>>1
psubw mm2, mm1 ; (a2>>1) - a3
movq [r0+i+2*16], mm0
movq [r0+i+6*16], mm2
movq mm0, mm6
psraw mm0, 1
paddw mm0, mm6 ; d25+(d25>>1)
movq mm1, mm3
psubw mm1, mm7 ; a5 = d07-d34-(d25+(d25>>1))
psubw mm1, mm0
movq mm0, mm5
psraw mm0, 1
paddw mm0, mm5 ; d16+(d16>>1)
movq mm2, mm3
paddw mm2, mm7 ; a6 = d07+d34-(d16+(d16>>1))
psubw mm2, mm0
movq mm0, mm3
psraw mm0, 1
paddw mm0, mm3 ; d07+(d07>>1)
paddw mm0, mm5
paddw mm0, mm6 ; a4 = d16+d25+(d07+(d07>>1))
movq mm3, mm7
psraw mm3, 1
paddw mm3, mm7 ; d34+(d34>>1)
paddw mm3, mm5
psubw mm3, mm6 ; a7 = d16-d25+(d34+(d34>>1))
movq mm7, mm3
psraw mm7, 2
paddw mm7, mm0 ; a4 + (a7>>2)
movq mm6, mm2
psraw mm6, 2
paddw mm6, mm1 ; a5 + (a6>>2)
psraw mm0, 2
psraw mm1, 2
psubw mm0, mm3 ; (a4>>2) - a7
psubw mm2, mm1 ; a6 - (a5>>2)
movq [r0+i+1*16], mm7
movq [r0+i+3*16], mm6
movq [r0+i+5*16], mm2
movq [r0+i+7*16], mm0
%assign i i+8
%endrep
ret
;-----------------------------------------------------------------------------
; void x264_yidct8_mmx( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
ALIGN 16
x264_yidct8_mmx:
;-------------------------------------------------------------------------
; vertical idct ( compute 4 columns at a time -> 2 loops )
;-------------------------------------------------------------------------
%assign i 0
%rep 2
movq mm1, [r0+i+1*16] ; mm1 = d1
movq mm3, [r0+i+3*16] ; mm3 = d3
movq mm5, [r0+i+5*16] ; mm5 = d5
movq mm7, [r0+i+7*16] ; mm7 = d7
movq mm4, mm7
psraw mm4, 1
movq mm0, mm5
psubw mm0, mm7
psubw mm0, mm4
psubw mm0, mm3 ; mm0 = e1
movq mm6, mm3
psraw mm6, 1
movq mm2, mm7
psubw mm2, mm6
psubw mm2, mm3
paddw mm2, mm1 ; mm2 = e3
movq mm4, mm5
psraw mm4, 1
paddw mm4, mm5
paddw mm4, mm7
psubw mm4, mm1 ; mm4 = e5
movq mm6, mm1
psraw mm6, 1
paddw mm6, mm1
paddw mm6, mm5
paddw mm6, mm3 ; mm6 = e7
movq mm1, mm0
movq mm3, mm4
movq mm5, mm2
movq mm7, mm6
psraw mm6, 2
psraw mm3, 2
psraw mm5, 2
psraw mm0, 2
paddw mm1, mm6 ; mm1 = f1
paddw mm3, mm2 ; mm3 = f3
psubw mm5, mm4 ; mm5 = f5
psubw mm7, mm0 ; mm7 = f7
movq mm2, [r0+i+2*16] ; mm2 = d2
movq mm6, [r0+i+6*16] ; mm6 = d6
movq mm4, mm2
movq mm0, mm6
psraw mm4, 1
psraw mm6, 1
psubw mm4, mm0 ; mm4 = a4
paddw mm6, mm2 ; mm6 = a6
movq mm2, [r0+i+0*16] ; mm2 = d0
movq mm0, [r0+i+4*16] ; mm0 = d4
SUMSUB_BA mm0, mm2 ; mm0 = a0, mm2 = a2
SUMSUB_BADC mm6, mm0, mm4, mm2 ; mm6 = f0, mm0 = f6
; mm4 = f2, mm2 = f4
SUMSUB_BADC mm7, mm6, mm5, mm4 ; mm7 = g0, mm6 = g7
; mm5 = g1, mm4 = g6
SUMSUB_BADC mm3, mm2, mm1, mm0 ; mm3 = g2, mm2 = g5
; mm1 = g3, mm0 = g4
movq [r0+i+0*16], mm7
movq [r0+i+1*16], mm5
movq [r0+i+2*16], mm3
movq [r0+i+3*16], mm1
movq [r0+i+4*16], mm0
movq [r0+i+5*16], mm2
movq [r0+i+6*16], mm4
movq [r0+i+7*16], mm6
%assign i i+8
%endrep
ret
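The e/f/g stages annotated in x264_yidct8_mmx above correspond to the following scalar sketch of one 8-point inverse-transform pass, written from those comments. idct8_1d_sketch is an illustrative name, not an x264 symbol.
#include <stdint.h>

/* One 8-point pass of the inverse transform, using the d/e/f/g names
 * from the comments in x264_yidct8_mmx. */
static void idct8_1d_sketch( int16_t dst[8], const int16_t d[8] )
{
    const int a0 =  d[0] + d[4];
    const int a2 =  d[0] - d[4];
    const int a4 = (d[2]>>1) - d[6];
    const int a6 = (d[6]>>1) + d[2];
    const int f0 = a0 + a6, f6 = a0 - a6;
    const int f2 = a2 + a4, f4 = a2 - a4;
    const int e1 = d[5] - d[7] - (d[7]>>1) - d[3];
    const int e3 = d[1] + d[7] - d[3] - (d[3]>>1);
    const int e5 = d[5] + (d[5]>>1) + d[7] - d[1];
    const int e7 = d[1] + (d[1]>>1) + d[3] + d[5];
    const int f1 = e1 + (e7>>2);
    const int f3 = e3 + (e5>>2);
    const int f5 = (e3>>2) - e5;
    const int f7 = e7 - (e1>>2);
    dst[0] = f0 + f7;   /* g0 */
    dst[1] = f2 + f5;   /* g1 */
    dst[2] = f4 + f3;   /* g2 */
    dst[3] = f6 + f1;   /* g3 */
    dst[4] = f6 - f1;   /* g4 */
    dst[5] = f4 - f3;   /* g5 */
    dst[6] = f2 - f5;   /* g6 */
    dst[7] = f0 - f7;   /* g7 */
}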
;-----------------------------------------------------------------------------
; void x264_pixel_add_8x8_mmx( uint8_t *dst, int16_t src[8][8] );
;-----------------------------------------------------------------------------
ALIGN 16
x264_pixel_add_8x8_mmx:
pxor mm7, mm7
%assign i 0
%rep 8
movq mm0, [r0]
movq mm2, [r1+i]
movq mm3, [r1+i+8]
movq mm1, mm0
psraw mm2, 6
psraw mm3, 6
punpcklbw mm0, mm7
punpckhbw mm1, mm7
paddw mm0, mm2
paddw mm1, mm3
packuswb mm0, mm1
movq [r0], mm0
add r0, FDEC_STRIDE
%assign i i+16
%endrep
ret
;-----------------------------------------------------------------------------
; void x264_transpose_8x8_mmx( int16_t src[8][8] );
;-----------------------------------------------------------------------------
ALIGN 16
x264_transpose_8x8_mmx:
movq m0, [r0 ]
movq m1, [r0+ 16]
movq m2, [r0+ 32]
movq m3, [r0+ 48]
TRANSPOSE4x4W 0,1,2,3,4
movq [r0 ], m0
movq [r0+ 16], m1
movq [r0+ 32], m2
movq [r0+ 48], m3
movq m0, [r0+ 72]
movq m1, [r0+ 88]
movq m2, [r0+104]
movq m3, [r0+120]
TRANSPOSE4x4W 0,1,2,3,4
movq [r0+ 72], m0
movq [r0+ 88], m1
movq [r0+104], m2
movq [r0+120], m3
movq m0, [r0+ 8]
movq m1, [r0+ 24]
movq m2, [r0+ 40]
movq m3, [r0+ 56]
TRANSPOSE4x4W 0,1,2,3,4
movq m4, [r0+ 64]
movq m5, [r0+ 80]
movq m6, [r0+ 96]
movq m7, [r0+112]
movq [r0+ 64], m0
movq [r0+ 80], m1
movq [r0+ 96], m2
movq [r0+112], m3
TRANSPOSE4x4W 4,5,6,7,0
movq [r0+ 8], m4
movq [r0+ 24], m5
movq [r0+ 40], m6
movq [r0+ 56], m7
ret
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_mmx, 3,3
call x264_pixel_sub_8x8_mmx
call x264_ydct8_mmx
call x264_transpose_8x8_mmx
jmp x264_ydct8_mmx
;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_mmx, 0,1
mov r0, r1m
add word [r0], 32
call x264_yidct8_mmx
call x264_transpose_8x8_mmx
call x264_yidct8_mmx
mov r1, r0
mov r0, r0m
jmp x264_pixel_add_8x8_mmx
INIT_XMM
%macro IDCT8_1D 8
movdqa m%1, m%3
movdqa m%5, m%7
; in: 0,4 in mem, rest in regs
; out: m0..m7
%macro IDCT8_1D 9
mova m%1, m%3
mova m%5, m%7
psraw m%3, 1
psraw m%7, 1
psubw m%3, m%5
paddw m%7, m%1
movdqa m%5, m%2
mova m%5, m%2
psraw m%5, 1
paddw m%5, m%2
paddw m%5, m%4
paddw m%5, m%6
movdqa m%1, m%6
mova m%1, m%6
psraw m%1, 1
paddw m%1, m%6
paddw m%1, m%8
@@ -397,8 +149,8 @@ INIT_XMM
psraw m%8, 1
psubw m%2, m%4
psubw m%6, m%8
movdqa m%4, m%5
movdqa m%8, m%1
mova m%4, m%5
mova m%8, m%1
psraw m%4, 2
psraw m%8, 2
paddw m%4, m%6
@@ -407,8 +159,8 @@ INIT_XMM
psraw m%2, 2
psubw m%5, m%6
psubw m%2, m%1
movdqa m%1, [r1+0x00]
movdqa m%6, [r1+0x40]
mova m%1, [%9+0x00]
mova m%6, [%9+0x40]
SUMSUB_BA m%6, m%1
SUMSUB_BA m%7, m%6
SUMSUB_BA m%3, m%1
@@ -420,19 +172,227 @@ INIT_XMM
SWAP %3, %8, %7
%endmacro
; in: m0..m7
; out: all except m4, which is in [%9+0x40]
INIT_MMX
ALIGN 16
load_diff_4x8_mmx:
LOAD_DIFF_8P m0, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF_8P m1, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
LOAD_DIFF_8P m2, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
LOAD_DIFF_8P m3, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
LOAD_DIFF_8P m4, m7, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
LOAD_DIFF_8P m5, m7, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
movq [r0], m0
LOAD_DIFF_8P m6, m7, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
LOAD_DIFF_8P m7, m0, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
movq m0, [r0]
ret
INIT_MMX
ALIGN 16
dct8_mmx:
DCT8_1D 0,1,2,3,4,5,6,7,r0
SAVE_MM_PERMUTATION dct8_mmx
ret
%macro SPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets
%xdefine %%base %1
%rep %0/2
%xdefine %%tmp m%2
%rotate %0/2
mova [%%base + %2*16], %%tmp
%rotate 1-%0/2
%endrep
%endmacro
%macro UNSPILL_SHUFFLE 3-*
%xdefine %%base %1
%rep %0/2
%xdefine %%tmp m%2
%rotate %0/2
mova %%tmp, [%%base + %2*16]
%rotate 1-%0/2
%endrep
%endmacro
%macro SPILL 2+ ; assume offsets are the same as reg numbers
SPILL_SHUFFLE %1, %2, %2
%endmacro
%macro UNSPILL 2+
UNSPILL_SHUFFLE %1, %2, %2
%endmacro
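SPILL_SHUFFLE stores each listed register into the 16-byte slot named by the matching offset; SPILL/UNSPILL are the common case where the slot index equals the register number (hence the comment). A rough C model of the convention, with hypothetical names:
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Model of SPILL_SHUFFLE base, r..., o... : register r[i] is stored into the
 * 16-byte slot o[i] of the buffer; only regsize bytes of each slot are written
 * (8 under INIT_MMX, 16 under INIT_XMM). SPILL base, 1,2,3 is the special case
 * where r[i] == o[i]; UNSPILL_SHUFFLE is the same loop with the copy reversed. */
static void spill_shuffle_model( uint8_t *base, const uint8_t *regs, size_t regsize,
                                 const int *r, const int *o, int n )
{
    for( int i = 0; i < n; i++ )
        memcpy( base + 16*o[i], regs + regsize*r[i], regsize );
}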
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_mmx, 3,3
global x264_sub8x8_dct8_mmx %+ .skip_prologue
.skip_prologue:
INIT_MMX
call load_diff_4x8_mmx
call dct8_mmx
UNSPILL r0, 0
TRANSPOSE4x4W 0,1,2,3,4
SPILL r0, 0,1,2,3
UNSPILL r0, 4,6
TRANSPOSE4x4W 4,5,6,7,0
SPILL r0, 4,5,6,7
INIT_MMX
add r1, 4
add r2, 4
add r0, 8
call load_diff_4x8_mmx
sub r1, 4
sub r2, 4
call dct8_mmx
sub r0, 8
UNSPILL r0+8, 4,6
TRANSPOSE4x4W 4,5,6,7,0
SPILL r0+8, 4,5,6,7
UNSPILL r0+8, 0
TRANSPOSE4x4W 0,1,2,3,5
UNSPILL r0, 4,5,6,7
SPILL_SHUFFLE r0, 0,1,2,3, 4,5,6,7
movq mm4, m6 ; depends on the permutation to not produce conflicts
movq mm0, m4
movq mm1, m5
movq mm2, mm4
movq mm3, m7
INIT_MMX
UNSPILL r0+8, 4,5,6,7
add r0, 8
call dct8_mmx
sub r0, 8
SPILL r0+8, 1,2,3,5,7
INIT_MMX
UNSPILL r0, 0,1,2,3,4,5,6,7
call dct8_mmx
SPILL r0, 1,2,3,5,7
ret
INIT_MMX
ALIGN 16
idct8_mmx:
IDCT8_1D 0,1,2,3,4,5,6,7,r1
SAVE_MM_PERMUTATION idct8_mmx
ret
%macro ADD_STORE_ROW 3
movq m1, [r0+%1*FDEC_STRIDE]
movq m2, m1
punpcklbw m1, m0
punpckhbw m2, m0
paddw m1, %2
paddw m2, %3
packuswb m1, m2
movq [r0+%1*FDEC_STRIDE], m1
%endmacro
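ADD_STORE_ROW adds one row of residuals, already descaled by the preceding runs of psraw 6, to the predicted pixels and repacks with unsigned saturation. A scalar model of the per-pixel operation (names illustrative):
#include <stdint.h>

static inline uint8_t clip_uint8( int x )
{
    return x < 0 ? 0 : x > 255 ? 255 : (uint8_t)x;   /* packuswb saturation */
}

/* Model of ADD_STORE_ROW: dst row += residual row, clipped to [0,255].
 * The >>6 descaling has already been applied by the caller. */
static void add_store_row_model( uint8_t dst[8], const int16_t res[8] )
{
    for( int i = 0; i < 8; i++ )
        dst[i] = clip_uint8( dst[i] + res[i] );
}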
;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_mmx, 2,2
global x264_add8x8_idct8_mmx %+ .skip_prologue
.skip_prologue:
INIT_MMX
add word [r1], 32
UNSPILL r1, 1,2,3,5,6,7
call idct8_mmx
SPILL r1, 7
TRANSPOSE4x4W 0,1,2,3,7
SPILL r1, 0,1,2,3
UNSPILL r1, 7
TRANSPOSE4x4W 4,5,6,7,0
SPILL r1, 4,5,6,7
INIT_MMX
UNSPILL r1+8, 1,2,3,5,6,7
add r1, 8
call idct8_mmx
sub r1, 8
SPILL r1+8, 7
TRANSPOSE4x4W 0,1,2,3,7
SPILL r1+8, 0,1,2,3
UNSPILL r1+8, 7
TRANSPOSE4x4W 4,5,6,7,0
SPILL r1+8, 4,5,6,7
INIT_MMX
movq m3, [r1+0x08]
movq m0, [r1+0x40]
movq [r1+0x40], m3
movq [r1+0x08], m0
; memory layout at this time:
; A0------ A1------
; B0------ F0------
; C0------ G0------
; D0------ H0------
; E0------ E1------
; B1------ F1------
; C1------ G1------
; D1------ H1------
UNSPILL_SHUFFLE r1, 1,2,3, 5,6,7
UNSPILL r1+8, 5,6,7
add r1, 8
call idct8_mmx
sub r1, 8
psraw m0, 6
psraw m1, 6
psraw m2, 6
psraw m3, 6
psraw m4, 6
psraw m5, 6
psraw m6, 6
psraw m7, 6
movq [r1+0x08], m0 ; mm4
movq [r1+0x48], m4 ; mm5
movq [r1+0x58], m5 ; mm0
movq [r1+0x68], m6 ; mm2
movq [r1+0x78], m7 ; mm6
movq mm5, [r1+0x18]
movq mm6, [r1+0x28]
movq [r1+0x18], m1 ; mm1
movq [r1+0x28], m2 ; mm7
movq mm7, [r1+0x38]
movq [r1+0x38], m3 ; mm3
movq mm1, [r1+0x10]
movq mm2, [r1+0x20]
movq mm3, [r1+0x30]
call idct8_mmx
psraw m0, 6
psraw m1, 6
psraw m2, 6
psraw m3, 6
psraw m4, 6
psraw m5, 6
psraw m6, 6
psraw m7, 6
SPILL r1, 0,1,2
pxor m0, m0
ADD_STORE_ROW 0, [r1+0x00], [r1+0x08]
ADD_STORE_ROW 1, [r1+0x10], [r1+0x18]
ADD_STORE_ROW 2, [r1+0x20], [r1+0x28]
ADD_STORE_ROW 3, m3, [r1+0x38]
ADD_STORE_ROW 4, m4, [r1+0x48]
ADD_STORE_ROW 5, m5, [r1+0x58]
ADD_STORE_ROW 6, m6, [r1+0x68]
ADD_STORE_ROW 7, m7, [r1+0x78]
ret
INIT_XMM
; in: m0..m7, except m6 which is in [%9+0x60]
; out: m0..m7, except m4 which is in [%9+0x40]
%macro TRANSPOSE8x8W 9
movdqa [%9], m%8
SBUTTERFLY wd, %1, %2, %8
SBUTTERFLY wd, %1, %2, %7
movdqa [%9+16], m%2
movdqa m%8, [%9]
movdqa m%7, [%9+0x60]
SBUTTERFLY wd, %3, %4, %2
SBUTTERFLY wd, %5, %6, %2
SBUTTERFLY wd, %7, %8, %2
SBUTTERFLY dq, %1, %3, %2
movdqa [%9], m%3
movdqa m%2, [16+%9]
movdqa m%2, [%9+16]
SBUTTERFLY dq, %2, %4, %3
SBUTTERFLY dq, %5, %7, %3
SBUTTERFLY dq, %6, %8, %3
@@ -446,25 +406,45 @@ INIT_XMM
SWAP %4, %7
%endmacro
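Register and spill choreography aside, TRANSPOSE8x8W computes a plain 8x8 transpose of 16-bit words; its scalar equivalent is simply:
#include <stdint.h>

/* Scalar equivalent of TRANSPOSE8x8W, ignoring which rows live in registers
 * and which in the spill slots at [%9+0x40]/[%9+0x60]. */
static void transpose_8x8_words( int16_t m[8][8] )
{
    for( int i = 0; i < 8; i++ )
        for( int j = 0; j < i; j++ )
        {
            int16_t t = m[i][j];
            m[i][j] = m[j][i];
            m[j][i] = t;
        }
}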
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_sse2, 3,3
global x264_sub8x8_dct8_sse2 %+ .skip_prologue
.skip_prologue:
LOAD_DIFF_8P m0, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF_8P m1, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
LOAD_DIFF_8P m2, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
LOAD_DIFF_8P m3, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
LOAD_DIFF_8P m4, m7, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
LOAD_DIFF_8P m5, m7, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
SPILL r0, 0
LOAD_DIFF_8P m6, m7, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
LOAD_DIFF_8P m7, m0, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
UNSPILL r0, 0
DCT8_1D 0,1,2,3,4,5,6,7,r0
UNSPILL r0, 0,4
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,r0
UNSPILL r0, 4
DCT8_1D 0,1,2,3,4,5,6,7,r0
SPILL r0, 1,2,3,5,7